From 48c20659de17a9ddb3b41f13c121056745023471 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Mon, 31 Aug 2020 19:22:03 +0000 Subject: [PATCH] August 31 updates --- .../tournament_node.py | 38 +- .../site-packages/markov/checkpoint_utils.py | 125 ---- .../site-packages/markov/constants.py | 4 - .../site-packages/markov/evaluation_worker.py | 46 +- .../markov/log_handler/constants.py | 14 +- .../markov/metrics/s3_metrics.py | 30 +- .../multi_agent_graph_manager.py | 56 +- .../site-packages/markov/rollout_worker.py | 92 ++- .../site-packages/markov/s3/__init__.py | 0 .../site-packages/markov/s3/constants.py | 71 ++- .../site-packages/markov/s3/files/__init__.py | 0 .../markov/s3/files/checkpoint.py | 174 ++++++ .../s3/files/checkpoint_files/__init__.py | 0 .../deepracer_checkpoint_json.py | 165 ++++++ .../checkpoint_files/rl_coach_checkpoint.py | 240 ++++++++ .../checkpoint_files/rl_coach_sync_file.py | 125 ++++ .../checkpoint_files/tensorflow_model.py | 416 +++++++++++++ .../markov/s3/files/hyperparameters.py | 2 +- .../markov/s3/files/ip_config.py | 14 +- .../site-packages/markov/s3/files/metrics.py | 3 +- .../markov/s3/files/simtrace_video.py | 4 +- .../site-packages/markov/s3/s3_client.py | 63 +- .../markov/s3_boto_data_store.py | 553 ++++++++---------- .../site-packages/markov/s3_client.py | 73 --- .../markov/samples/sample_collector.py | 49 +- .../site-packages/markov/tournament_worker.py | 55 +- .../site-packages/markov/training_worker.py | 108 ++-- .../python3.5/site-packages/markov/utils.py | 343 +---------- .../site-packages/markov/validation_worker.py | 71 ++- .../SOURCES.txt | 77 +-- 30 files changed, 1866 insertions(+), 1145 deletions(-) delete mode 100644 bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/checkpoint_utils.py create mode 100644 bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/__init__.py create mode 100644 bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/__init__.py create mode 
100644 bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint.py create mode 100644 bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/__init__.py create mode 100644 bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/deepracer_checkpoint_json.py create mode 100644 bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/rl_coach_checkpoint.py create mode 100644 bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/rl_coach_sync_file.py create mode 100644 bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/tensorflow_model.py delete mode 100644 bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3_client.py diff --git a/bundle/deepracer_simulation_environment/lib/deepracer_simulation_environment/tournament_node.py b/bundle/deepracer_simulation_environment/lib/deepracer_simulation_environment/tournament_node.py index b4e69636..6422c2df 100755 --- a/bundle/deepracer_simulation_environment/lib/deepracer_simulation_environment/tournament_node.py +++ b/bundle/deepracer_simulation_environment/lib/deepracer_simulation_environment/tournament_node.py @@ -35,6 +35,7 @@ from markov.s3.constants import (MODEL_METADATA_LOCAL_PATH_FORMAT, MODEL_METADATA_S3_POSTFIX, YAML_LOCAL_PATH_FORMAT, AgentType, YamlKey) from markov.s3.utils import get_s3_key +from markov.s3.s3_client import S3Client logger = Logger(__name__, logging.INFO).get_logger() # Amount of time to wait to guarantee that RoboMaker's network configuration is ready. 
@@ -143,7 +144,7 @@ def main(): # create boto3 session/client and download yaml/json file session = boto3.session.Session() s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", None) - s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_endpoint_url, config=get_boto_config()) + s3_client = S3Client(region_name=s3_region,s3_endpoint_url=s3_endpoint_url) # Intermediate tournament files queue_pickle_name = 'tournament_candidate_queue.pkl' @@ -158,12 +159,13 @@ def main(): final_report_s3_key = os.path.normpath(os.path.join(s3_prefix, final_report_name)) try: - s3_client.download_file(Bucket=s3_bucket, - Key=queue_pickle_s3_key, - Filename=local_queue_pickle_path) - s3_client.download_file(Bucket=s3_bucket, - Key=report_pickle_s3_key, - Filename=local_report_pickle_path) + s3_client.download_file(bucket=s3_bucket, + s3_key=queue_pickle_s3_key, + local_path=local_queue_pickle_path) + + s3_client.download_file(bucket=s3_bucket, + s3_key=report_pickle_s3_key, + local_path=local_report_pickle_path) except: pass @@ -307,15 +309,18 @@ def main(): # Persist latest queue and report to use after job restarts. 
with open(local_queue_pickle_path, 'wb') as f: pickle.dump(tournament_candidate_queue, f, protocol=2) - s3_client.upload_file(Filename=local_queue_pickle_path, - Bucket=s3_bucket, - Key=queue_pickle_s3_key, ExtraArgs=s3_extra_args) + s3_client.upload_file(bucket=s3_bucket, + s3_key=queue_pickle_s3_key, + local_path=local_queue_pickle_path, + s3_kms_extra_args=s3_extra_args) with open(local_report_pickle_path, 'wb') as f: pickle.dump(tournament_report, f, protocol=2) - s3_client.upload_file(Filename=local_report_pickle_path, - Bucket=s3_bucket, - Key=report_pickle_s3_key, ExtraArgs=s3_extra_args) + + s3_client.upload_file(bucket=s3_bucket, + s3_key=report_pickle_s3_key, + local_path=local_report_pickle_path, + s3_kms_extra_args=s3_extra_args) # If there is more than 1 candidates then restart the simulation job otherwise # tournament is finished, persists final report and ends the job. @@ -326,9 +331,10 @@ def main(): else: # Persist final tournament report in json format # and terminate the job by canceling it - s3_client.put_object(Bucket=s3_bucket, - Key=final_report_s3_key, - Body=json.dumps(tournament_report), **s3_extra_args) + s3_client.put_object(bucket=s3_bucket, + s3_key=final_report_s3_key, + body=json.dumps(tournament_report), + s3_kms_extra_args=s3_extra_args) cancel_simulation_job(os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'), s3_region) diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/checkpoint_utils.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/checkpoint_utils.py deleted file mode 100644 index 577695ea..00000000 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/checkpoint_utils.py +++ /dev/null @@ -1,125 +0,0 @@ -import logging -import os -import time -import tensorflow as tf -import glob -import shutil -import re - -from markov import utils -from markov.log_handler.logger import Logger -from markov.log_handler.exception_handler import log_and_exit -from markov.log_handler.constants 
import (SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_500, SIMAPP_EVENT_ERROR_CODE_400) -from rl_coach.checkpoint import CheckpointStateFile - -logger = Logger(__name__, logging.INFO).get_logger() -TEMP_RENAME_FOLDER = "./renamed_checkpoint" - -def rename_checkpoints(checkpoint_dir, agent_name): - ''' Helper method that rename the specific checkpoint in the CheckpointStateFile - to be scoped with agent_name - checkpoint_dir - local checkpoint folder where the checkpoints and .checkpoint file is stored - agent_name - name of the agent - ''' - try: - logger.info("Renaming checkpoint from checkpoint_dir: {} for agent: {}".format(checkpoint_dir, agent_name)) - state_file = CheckpointStateFile(os.path.abspath(checkpoint_dir)) - checkpoint_name = str(state_file.read()) - tf_checkpoint_file = os.path.join(checkpoint_dir, "checkpoint") - with open(tf_checkpoint_file, "w") as outfile: - outfile.write("model_checkpoint_path: \"{}\"".format(checkpoint_name)) - - config = tf.ConfigProto() - config.allow_soft_placement = True # allow placing ops on cpu if they are not fit for gpu - config.gpu_options.allow_growth = True # allow the gpu memory allocated for the worker to grow if needed - config.gpu_options.per_process_gpu_memory_fraction = 0.2 - config.intra_op_parallelism_threads = 1 - config.inter_op_parallelism_threads = 1 - - with tf.Session(config=config) as sess: - for var_name, _ in tf.contrib.framework.list_variables(checkpoint_dir): - # Load the variable - var = tf.contrib.framework.load_variable(checkpoint_dir, var_name) - new_name = var_name - # Set the new name - # Replace agent/ or agent_#/ with {agent_name}/ - new_name = re.sub('agent/|agent_\d+/', '{}/'.format(agent_name), new_name) - # Rename the variable - var = tf.Variable(var, name=new_name) - saver = tf.train.Saver() - sess.run(tf.global_variables_initializer()) - renamed_checkpoint_path = os.path.join(TEMP_RENAME_FOLDER, checkpoint_name) - logger.info('Saving updated checkpoint to 
{}'.format(renamed_checkpoint_path)) - saver.save(sess, renamed_checkpoint_path) - # Remove the tensorflow 'checkpoint' file - os.remove(tf_checkpoint_file) - # Remove the old checkpoint from the checkpoint dir - for file_name in os.listdir(checkpoint_dir): - if checkpoint_name in file_name: - os.remove(os.path.join(checkpoint_dir, file_name)) - # Copy the new checkpoint with renamed variable to the checkpoint dir - for file_name in os.listdir(TEMP_RENAME_FOLDER): - full_file_name = os.path.join(os.path.abspath(TEMP_RENAME_FOLDER), file_name) - if os.path.isfile(full_file_name) and file_name != "checkpoint": - shutil.copy(full_file_name, checkpoint_dir) - # Remove files from temp_rename_folder - shutil.rmtree(TEMP_RENAME_FOLDER) - tf.reset_default_graph() - # If either of the checkpoint files (index, meta or data) not found - except tf.errors.NotFoundError as err: - log_and_exit("No checkpoint found: {}".format(err), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - # Thrown when user modifies model, checkpoints get corrupted/truncated - except tf.errors.DataLossError as err: - log_and_exit("User modified ckpt, unrecoverable dataloss or corruption: {}" - .format(err), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - except tf.errors.OutOfRangeError as err: - log_and_exit("User modified ckpt: {}" - .format(err), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - except ValueError as err: - if utils.is_user_error(err): - log_and_exit("Couldn't find 'checkpoint' file or checkpoints in given \ - directory ./checkpoint: {}".format(err), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - else: - log_and_exit("ValueError in rename checkpoint: {}".format(err), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_500) - except Exception as ex: - log_and_exit("Exception in rename checkpoint: {}".format(ex), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_500) - -def 
modify_checkpoint_variables(checkpoint_dirs, agent_names): - for checkpoint_dir, agent_name in zip(checkpoint_dirs, agent_names): - rename_checkpoints(checkpoint_dir, agent_name) - -def wait_for_checkpoints(checkpoint_dirs, data_store=None, timeout=10): - """ - block until there is a checkpoint in all of the checkpoint_dirs - """ - chkpt_state_files = [CheckpointStateFile(checkpoint_dir) for checkpoint_dir in checkpoint_dirs] - for i in range(timeout): - if data_store: - data_store.load_from_store() - all_agent_checkpoint_copied = all([chkpt_state_file.read() is not None for chkpt_state_file in chkpt_state_files]) - if all_agent_checkpoint_copied: - return - time.sleep(10) - - # one last time - all_agent_checkpoint_copied = all([chkpt_state_file.read() is not None for chkpt_state_file in chkpt_state_files]) - if all_agent_checkpoint_copied: - return - - log_and_exit("Checkpoint never found in {} : {}, waited {} seconds." \ - .format(checkpoint_dirs, all_agent_checkpoint_copied, timeout), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_500) diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/constants.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/constants.py index 027d4cc2..d12b44a4 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/constants.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/constants.py @@ -10,10 +10,6 @@ # The robomaker team has asked us to wait 5 minutes to let their workflow cancel # the simulation job ROBOMAKER_CANCEL_JOB_WAIT_TIME = 60 * 5 -# The current checkpoint key -CHKPNT_KEY_SUFFIX = "model/.coach_checkpoint" -# This is the key for the best checkpoint -DEEPRACER_CHKPNT_KEY_SUFFIX = "model/deepracer_checkpoints.json" # The number of times to retry a failed boto call NUM_RETRIES = 5 # The time in seconds till a timeout exception is thrown when attempting to make a connection diff --git 
a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/evaluation_worker.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/evaluation_worker.py index f2a8958d..4229f50c 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/evaluation_worker.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/evaluation_worker.py @@ -30,7 +30,6 @@ configure_environment_randomizer, get_robomaker_profiler_env) from markov.rospy_wrappers import ServiceProxyWrapper from markov.camera_utils import configure_camera -from markov.checkpoint_utils import TEMP_RENAME_FOLDER, wait_for_checkpoints, modify_checkpoint_variables from markov.track_geom.track_data import TrackData from markov.track_geom.utils import get_start_positions from markov.s3.constants import (MODEL_METADATA_LOCAL_PATH_FORMAT, @@ -42,6 +41,7 @@ SimtraceVideoNames) from markov.s3.files.model_metadata import ModelMetadata from markov.s3.files.simtrace_video import SimtraceVideo +from markov.s3.files.checkpoint import Checkpoint from markov.s3.utils import get_s3_key from std_srvs.srv import Empty, EmptyRequest @@ -50,9 +50,6 @@ MIN_RESET_COUNT = 10000 #TODO: change when console passes float("inf") -if not os.path.exists(TEMP_RENAME_FOLDER): - os.makedirs(TEMP_RENAME_FOLDER) - IS_PROFILER_ON, PROFILER_S3_BUCKET, PROFILER_S3_PREFIX = get_robomaker_profiler_env() def evaluation_worker(graph_manager, number_of_trials, task_parameters, simtrace_video_s3_writers, is_continuous, @@ -71,21 +68,15 @@ def evaluation_worker(graph_manager, number_of_trials, task_parameters, simtrace # Collect profiler information only IS_PROFILER_ON is true with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET, s3_prefix=PROFILER_S3_PREFIX, output_local_path=ROLLOUT_WORKER_PROFILER_PATH, enable_profiling=IS_PROFILER_ON): - checkpoint_dirs = list() - agent_names = list() subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic = list(), list() subscribe_to_save_mp4, unsubscribe_from_save_mp4 = 
list(), list() for agent_param in graph_manager.agents_params: - _checkpoint_dir = task_parameters.checkpoint_restore_path if len(graph_manager.agents_params) == 1 \ - else os.path.join(task_parameters.checkpoint_restore_path, agent_param.name) - agent_names.append(agent_param.name) - checkpoint_dirs.append(_checkpoint_dir) racecar_name = 'racecar' if len(agent_param.name.split("_")) == 1 \ else "racecar_{}".format(agent_param.name.split("_")[1]) subscribe_to_save_mp4_topic.append("/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name)) unsubscribe_from_save_mp4_topic.append("/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name)) - wait_for_checkpoints(checkpoint_dirs, graph_manager.data_store) - modify_checkpoint_variables(checkpoint_dirs, agent_names) + graph_manager.data_store.wait_for_checkpoints() + graph_manager.data_store.modify_checkpoint_variables() # Make the clients that will allow us to pause and unpause the physics rospy.wait_for_service('/gazebo/pause_physics_dr') @@ -253,6 +244,7 @@ def main(): agent_list = list() s3_bucket_dict = dict() s3_prefix_dict = dict() + checkpoint_dict = dict() start_positions = get_start_positions(len(arg_s3_bucket)) done_condition = utils.str_to_done_condition(rospy.get_param("DONE_CONDITION", any)) park_positions = utils.pos_2d_str_to_list(rospy.get_param("PARK_POSITIONS", [])) @@ -274,11 +266,24 @@ def main(): _, _, version = model_metadata.get_model_metadata_info() - # Select the optimal model - utils.do_model_selection(s3_bucket=arg_s3_bucket[agent_index], + # checkpoint s3 instance + checkpoint = Checkpoint(bucket=arg_s3_bucket[agent_index], s3_prefix=arg_s3_prefix[agent_index], - region=args.aws_region, - s3_endpoint_url=args.s3_endpoint_url) + region_name=args.aws_region, + s3_endpoint_url=args.s3_endpoint_url, + agent_name=agent_name, + checkpoint_dir=args.local_model_directory) + # make coach checkpoint compatible + if version < SIMAPP_VERSION_2 and not checkpoint.rl_coach_checkpoint.is_compatible(): + 
checkpoint.rl_coach_checkpoint.make_compatible(checkpoint.syncfile_ready) + # get best model checkpoint string + model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint() + # Select the best checkpoint model by uploading rl coach .coach_checkpoint file + checkpoint.rl_coach_checkpoint.update( + model_checkpoint_name=model_checkpoint_name, + s3_kms_extra_args=utils.get_s3_kms_extra_args()) + + checkpoint_dict[agent_name] = checkpoint agent_config = { 'model_metadata': model_metadata, @@ -364,13 +369,10 @@ def main(): enable_domain_randomization=enable_domain_randomization, done_condition=done_condition) - ds_params_instance = S3BotoDataStoreParameters(aws_region=args.aws_region, - bucket_names=s3_bucket_dict, - base_checkpoint_dir=args.local_model_directory, - s3_folders=s3_prefix_dict, - s3_endpoint_url=args.s3_endpoint_url) + ds_params_instance = S3BotoDataStoreParameters(checkpoint_dict=checkpoint_dict) - graph_manager.data_store = S3BotoDataStore(params=ds_params_instance, graph_manager=graph_manager, + graph_manager.data_store = S3BotoDataStore(params=ds_params_instance, + graph_manager=graph_manager, ignore_lock=True) graph_manager.env_params.seed = 0 diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/log_handler/constants.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/log_handler/constants.py index 662a458f..11e98f16 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/log_handler/constants.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/log_handler/constants.py @@ -59,8 +59,8 @@ 32: "No objects found", 33: "No checkpoint file found", 34: "Unable to make model compatible", - 35: "Checkpoint never found in", - 36: "Download params and launch of agent node S3 ClientError", + 35: "Checkpoint never found", + 36: "Failed to parse model_metadata file", 37: "Validation worker value error", 38: "Unable to write metrics to s3: bucket", 39: "Unable to 
write metrics to s3, exception", @@ -107,7 +107,15 @@ 80: "Exception in putting objects", 81: "Unable to upload fileobj", 82: "Unable to list objects", - 83: "Unable to put object" + 83: "Unable to put object", + 84: "Exception in uploading .finished file", + 85: "Exception in uploading .lock file", + 86: "Exception in uploading .ready file", + 87: "Unable to delete object from s3", + 88: "Can't download deepracer checkpoint json", + 89: "ready never found", + 90: "Exception in downloading .ready", + 91: "Unable to paginate from s3" } # New error yet to be mapped diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/metrics/s3_metrics.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/metrics/s3_metrics.py index 6d93ed2e..e5770b9f 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/metrics/s3_metrics.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/metrics/s3_metrics.py @@ -58,21 +58,20 @@ def write_simtrace_to_local_file(file_path: str, metrics_data: OrderedDict): class TrainingMetrics(MetricsInterface, ObserverInterface, AbstractTracker): '''This class is responsible for uploading training metrics to s3''' - def __init__(self, agent_name, s3_dict_metrics, s3_dict_model, ckpnt_dir, run_phase_sink, use_model_picker=True): + def __init__(self, agent_name, s3_dict_metrics, deepracer_checkpoint_json, ckpnt_dir, run_phase_sink, use_model_picker=True): '''s3_dict_metrics - Dictionary containing the required s3 info for the metrics bucket with keys specified by MetricsS3Keys - s3_dict_model - Dictionary containing the required s3 info for the model - bucket, which is where the best model info will be saved with - keys specified by MetricsS3Keys + deepracer_checkpoint_json - DeepracerCheckpointJson instance ckpnt_dir - Directory where the current checkpont is to be stored run_phase_sink - Sink to recieve notification of a change in run phase use_model_picker - Flag to whether to use model picker 
or not. ''' self._agent_name_ = agent_name + self._deepracer_checkpoint_json = deepracer_checkpoint_json self._s3_metrics = Metrics(bucket=s3_dict_metrics[MetricsS3Keys.METRICS_BUCKET.value], s3_key=s3_dict_metrics[MetricsS3Keys.METRICS_KEY.value], region_name=s3_dict_metrics[MetricsS3Keys.REGION.value], - s3_endpoint_url=[MetricsS3Keys.ENDPOINT_URL.value]) + s3_endpoint_url=s3_dict_metrics[MetricsS3Keys.ENDPOINT_URL.value]) self._start_time_ = time.time() self._episode_ = 0 self._episode_reward_ = 0.0 @@ -180,10 +179,10 @@ def update(self, data): last_chkpnt_stats = {'name': self._eval_stats_dict_['chkpnt_name'], 'avg_comp_pct': mean_pct, 'time_stamp': time_stamp} - json_metrics = json.dumps({BEST_CHECKPOINT: self._best_chkpnt_stats, - LAST_CHECKPOINT: last_chkpnt_stats}) - self._s3_deepracer_json_metrics.persist(body=json_metrics, - s3_kms_extra_args=get_s3_kms_extra_args()) + self._deepracer_checkpoint_json.persist( + body=json.dumps({BEST_CHECKPOINT: self._best_chkpnt_stats, + LAST_CHECKPOINT: last_chkpnt_stats}), + s3_kms_extra_args=get_s3_kms_extra_args()) # Update the checkpoint name to the new checkpoint being used for training that will # then be evaluated, note this class gets notfied when the system is put into a # training phase and assumes that a training phase only starts when a new check point @@ -228,7 +227,7 @@ def _handle_get_video_metrics(self, req): self._video_metrics[Mp4VideoMetrics.Y.value], self._video_metrics[Mp4VideoMetrics.OBJECT_LOCATIONS.value]) -class EvalMetrics(MetricsInterface , AbstractTracker): +class EvalMetrics(MetricsInterface, AbstractTracker): '''This class is responsible for uploading eval metrics to s3''' def __init__(self, agent_name, s3_dict_metrics, is_continuous): '''Init eval metrics @@ -240,8 +239,11 @@ def __init__(self, agent_name, s3_dict_metrics, is_continuous): is_continuous (bool): True if continuous race, False otherwise ''' self._agent_name_ = agent_name - self._s3_dict_metrics_ = s3_dict_metrics - 
self._is_continuous = is_continuous + self._s3_metrics = Metrics(bucket=s3_dict_metrics[MetricsS3Keys.METRICS_BUCKET.value], + s3_key=s3_dict_metrics[MetricsS3Keys.METRICS_KEY.value], + region_name=s3_dict_metrics[MetricsS3Keys.REGION.value], + s3_endpoint_url=s3_dict_metrics[MetricsS3Keys.ENDPOINT_URL.value]) + self._is_continuous = is_continuous self._start_time_ = time.time() self._number_of_trials_ = 0 self._progress_ = 0.0 @@ -265,10 +267,9 @@ def __init__(self, agent_name, s3_dict_metrics, is_continuous): self._video_metrics = Mp4VideoMetrics.get_empty_dict() self._reset_count_sum = 0 self._current_sim_time = 0 - self.track_data = TrackData.get_instance() + self.track_data = TrackData.get_instance() rospy.Service("/{}/{}".format(self._agent_name_, "mp4_video_metrics"), VideoMetricsSrv, self._handle_get_video_metrics) - AbstractTracker.__init__(self, TrackerPriority.HIGH) def update_tracker(self, delta_time, sim_time): @@ -281,7 +282,6 @@ def update_tracker(self, delta_time, sim_time): """ self._current_sim_time = sim_time.clock.secs + 1.e-9 * sim_time.clock.nsecs - def reset(self): self._start_time_ = self._current_sim_time self._reset_count_sum += \ diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/multi_agent_coach/multi_agent_graph_manager.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/multi_agent_coach/multi_agent_graph_manager.py index 0f401b2f..2311d19a 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/multi_agent_coach/multi_agent_graph_manager.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/multi_agent_coach/multi_agent_graph_manager.py @@ -36,7 +36,7 @@ def __init__(self, agents_params: List[AgentParameters], env_params: Environment preset_validation_params: PresetValidationParameters = PresetValidationParameters(), done_condition=any): self.done_condition = done_condition - self.sess = {agent_params.name: None for agent_params in agents_params} + self.sess = 
None self.level_managers = [] # type: List[MultiAgentLevelManager] self.top_level_manager = None self.environments = [] @@ -61,7 +61,7 @@ def __init__(self, agents_params: List[AgentParameters], env_params: Environment } self.checkpoint_id = 0 - self.checkpoint_saver = {agent_params.name: None for agent_params in agents_params} + self.checkpoint_saver = None self.checkpoint_state_updater = None self.graph_logger = Logger() self.data_store = None @@ -248,6 +248,7 @@ def create_session(self, task_parameters: TaskParameters): # Create parameter saver self.checkpoint_saver = {agent_params.name: SaverCollection() for agent_params in self.agents_params} + self.checkpoint_state_updater = {agent_params.name: None for agent_params in self.agents_params} for level in self.level_managers: for agent_params in self.agents_params: self.checkpoint_saver[agent_params.name].update(level.collect_savers(agent_params.name)) @@ -538,10 +539,8 @@ def restore_checkpoint(self): if self.task_parameters.checkpoint_restore_path: restored_checkpoint_paths = [] for agent_params in self.agents_params: - if len(self.agents_params) == 1: - agent_checkpoint_restore_path = self.task_parameters.checkpoint_restore_path - else: - agent_checkpoint_restore_path = os.path.join(self.task_parameters.checkpoint_restore_path, agent_params.name) + # for single agent name is 'agent'. For multi agent name is 'agent_0' ... 
+ agent_checkpoint_restore_path = os.path.join(self.task_parameters.checkpoint_restore_path, agent_params.name) if os.path.isdir(agent_checkpoint_restore_path): # a checkpoint dir if self.task_parameters.framework_type == Frameworks.tensorflow and\ @@ -620,29 +619,26 @@ def save_checkpoint(self): if not os.path.exists(self.task_parameters.checkpoint_save_dir): os.mkdir(self.task_parameters.checkpoint_save_dir) # Create directory structure - if self.checkpoint_state_updater is None: - self.checkpoint_state_updater = CheckpointStateUpdater(self.task_parameters.checkpoint_save_dir) - checkpoint_name = "{}_Step-{}.ckpt".format( self.checkpoint_id, self.total_steps_counters[RunPhase.TRAIN][EnvironmentSteps]) saved_checkpoint_paths = [] for agent_params in self.agents_params: - if len(self.agents_params) == 1: - agent_checkpoint_save_dir =self.task_parameters.checkpoint_save_dir - else: - agent_checkpoint_save_dir = os.path.join(self.task_parameters.checkpoint_save_dir, agent_params.name) + agent_checkpoint_save_dir = os.path.join(self.task_parameters.checkpoint_save_dir, agent_params.name) if not os.path.exists(agent_checkpoint_save_dir): os.mkdir(agent_checkpoint_save_dir) + if self.checkpoint_state_updater[agent_params.name] is None: + self.checkpoint_state_updater[agent_params.name] = CheckpointStateUpdater(agent_checkpoint_save_dir) + agent_checkpoint_path = os.path.join(agent_checkpoint_save_dir, checkpoint_name) if not isinstance(self.task_parameters, DistributedTaskParameters): saved_checkpoint_paths.append(self.checkpoint_saver[agent_params.name].save(self.sess[agent_params.name], agent_checkpoint_path)) else: saved_checkpoint_paths.append(agent_checkpoint_path) - if self.num_checkpoints_to_keep < len(self.checkpoint_state_updater.all_checkpoints): - checkpoint_to_delete = self.checkpoint_state_updater.all_checkpoints[-self.num_checkpoints_to_keep - 1] + if self.num_checkpoints_to_keep < len(self.checkpoint_state_updater[agent_params.name].all_checkpoints): + 
checkpoint_to_delete = self.checkpoint_state_updater[agent_params.name].all_checkpoints[-self.num_checkpoints_to_keep - 1] agent_checkpoint_to_delete = os.path.join(agent_checkpoint_save_dir, checkpoint_to_delete.name) for file in glob.glob("{}*".format(agent_checkpoint_to_delete)): os.remove(file) @@ -659,7 +655,8 @@ def save_checkpoint(self): self.save_onnx_graph() # write the new checkpoint name to a file to signal this checkpoint has been fully saved - self.checkpoint_state_updater.update(SingleCheckpoint(self.checkpoint_id, checkpoint_name)) + for agent_params in self.agents_params: + self.checkpoint_state_updater[agent_params.name].update(SingleCheckpoint(self.checkpoint_id, checkpoint_name)) screen.log_dict( OrderedDict([ @@ -753,11 +750,19 @@ def get_data_store(self, param): return data_store_creator(param) def signal_ready(self): + # save .ready locally for centralized training. When a training job starts, + # rollout_worker.py will wait for the .ready file while training_worker.py will write + # .ready file here. For distributed training, .ready will be uploaded to the s3 + # while for centrazlied training, .ready file will be saved locally. 
if self.task_parameters.checkpoint_save_dir and os.path.exists(self.task_parameters.checkpoint_save_dir): - open(os.path.join(self.task_parameters.checkpoint_save_dir, SyncFiles.TRAINER_READY.value), 'w').close() + for agent_params in self.agents_params: + agent_checkpoint_save_dir = os.path.join(self.task_parameters.checkpoint_save_dir, agent_params.name) + if not os.path.exists(agent_checkpoint_save_dir): + os.mkdir(agent_checkpoint_save_dir) + open(os.path.join(agent_checkpoint_save_dir, SyncFiles.TRAINER_READY.value), 'w').close() if hasattr(self, 'data_store_params'): - data_store = self.get_data_store(self.data_store_params) - data_store.save_to_store() + data_store = self.get_data_store(self.data_store_params) + data_store.signal_ready() def close(self) -> None: """ @@ -779,11 +784,20 @@ def flush_finished(self): To indicate the training has finished, writes a `.finished` file to the checkpoint directory and calls the data store to updload that file. """ + # save .finished locally for centralized training. When a training job ends, + # rollout_worker.py will check .finished to persist file and exit + # while training_worker.py will write .finished file here. + # For distributed training, .finished will be uploaded to the s3 + # while for centrazlied training, .finished file will be saved locally. 
if self.task_parameters.checkpoint_save_dir and os.path.exists(self.task_parameters.checkpoint_save_dir): - open(os.path.join(self.task_parameters.checkpoint_save_dir, SyncFiles.FINISHED.value), 'w').close() + for agent_params in self.agents_params: + agent_checkpoint_save_dir = os.path.join(self.task_parameters.checkpoint_save_dir, agent_params.name) + if not os.path.exists(agent_checkpoint_save_dir): + os.mkdir(agent_checkpoint_save_dir) + open(os.path.join(agent_checkpoint_save_dir, SyncFiles.FINISHED.value), 'w').close() if hasattr(self, 'data_store_params'): data_store = self.get_data_store(self.data_store_params) - data_store.save_to_store() + data_store.flush_finished() def set_schedule_params(self, schedule_params: ScheduleParameters): """ diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/rollout_worker.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/rollout_worker.py index 1bc96b5e..ac6e3e65 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/rollout_worker.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/rollout_worker.py @@ -13,17 +13,18 @@ import time import rospy import future_fstrings +import botocore from rl_coach.base_parameters import TaskParameters, DistributedCoachSynchronizationType, RunType from rl_coach.checkpoint import CheckpointStateReader from rl_coach.core_types import RunPhase, EnvironmentSteps from rl_coach.logger import screen -from rl_coach.rollout_worker import wait_for_checkpoint, wait_for_trainer_ready, should_stop +from rl_coach.rollout_worker import should_stop from rl_coach.utils import short_dynamic_import from rl_coach.core_types import EnvironmentEpisodes from markov import utils -from markov.constants import DEEPRACER_CHKPNT_KEY_SUFFIX, ROLLOUT_WORKER_PROFILER_PATH +from markov.constants import ROLLOUT_WORKER_PROFILER_PATH from markov.log_handler.logger import Logger from markov.log_handler.exception_handler import log_and_exit, 
simapp_exit_gracefully from markov.log_handler.constants import (SIMAPP_SIMULATION_WORKER_EXCEPTION, @@ -39,7 +40,6 @@ from markov.metrics.iteration_data import IterationData from markov.metrics.constants import MetricsS3Keys from markov.s3_boto_data_store import S3BotoDataStore, S3BotoDataStoreParameters -from markov.s3_client import SageS3Client from markov.sagemaker_graph_manager import get_graph_manager from markov.rospy_wrappers import ServiceProxyWrapper from markov.camera_utils import configure_camera @@ -49,6 +49,7 @@ from markov.s3.files.reward_function import RewardFunction from markov.s3.files.simtrace_video import SimtraceVideo from markov.s3.files.ip_config import IpConfig +from markov.s3.files.checkpoint import Checkpoint from markov.s3.utils import get_s3_key from markov.s3.constants import (HYPERPARAMETER_LOCAL_PATH_FORMAT, MODEL_METADATA_LOCAL_PATH_FORMAT, @@ -60,6 +61,7 @@ CAMERA_TOPVIEW_LOCAL_PATH_FORMAT, SimtraceVideoNames, IP_ADDRESS_LOCAL_PATH) +from markov.s3.s3_client import S3Client from std_srvs.srv import Empty, EmptyRequest logger = Logger(__name__, logging.INFO).get_logger() @@ -72,16 +74,39 @@ IS_PROFILER_ON, PROFILER_S3_BUCKET, PROFILER_S3_PREFIX = get_robomaker_profiler_env() -def download_custom_files_if_present(s3_client, s3_prefix): - environment_file_s3_key = os.path.normpath(s3_prefix + "/environments/deepracer_racetrack_env.py") - environment_local_path = os.path.join(CUSTOM_FILES_PATH, "deepracer_racetrack_env.py") - success_environment_download = s3_client.download_file(s3_key=environment_file_s3_key, - local_path=environment_local_path) - preset_file_s3_key = os.path.normpath(s3_prefix + "/presets/preset.py") - preset_local_path = os.path.join(CUSTOM_FILES_PATH, "preset.py") - success_preset_download = s3_client.download_file(s3_key=preset_file_s3_key, - local_path=preset_local_path) +def download_custom_files_if_present(s3_bucket, s3_prefix, aws_region, s3_endpoint_url): + '''download custom environment and preset files + + 
Args: + s3_bucket (str): s3 bucket string + s3_prefix (str): s3 prefix string + aws_region (str): aws region string + + Returns: + tuple (bool, bool): tuple of bool on whether preset and environment + is downloaded successfully + ''' + success_environment_download, success_preset_download = False, False + try: + s3_client = S3Client(region_name=aws_region, s3_endpoint_url=s3_endpoint_url, max_retry_attempts=0) + environment_file_s3_key = os.path.normpath(s3_prefix + "/environments/deepracer_racetrack_env.py") + environment_local_path = os.path.join(CUSTOM_FILES_PATH, "deepracer_racetrack_env.py") + s3_client.download_file(bucket=s3_bucket, + s3_key=environment_file_s3_key, + local_path=environment_local_path) + success_environment_download = True + except botocore.exceptions.ClientError: + pass + try: + preset_file_s3_key = os.path.normpath(s3_prefix + "/presets/preset.py") + preset_local_path = os.path.join(CUSTOM_FILES_PATH, "preset.py") + s3_client.download_file(bucket=s3_bucket, + s3_key=preset_file_s3_key, + local_path=preset_local_path) + success_preset_download = True + except botocore.exceptions.ClientError: + pass return success_preset_download, success_environment_download def exit_if_trainer_done(checkpoint_dir, simtrace_video_s3_writers, rollout_idx): @@ -109,9 +134,10 @@ def rollout_worker(graph_manager, num_workers, rollout_idx, task_parameters, sim data_store = graph_manager.data_store - checkpoint_dir = task_parameters.checkpoint_restore_path - wait_for_checkpoint(checkpoint_dir, data_store) - wait_for_trainer_ready(checkpoint_dir, data_store) + #TODO change agent to specific agent name for multi agent case + checkpoint_dir = os.path.join(task_parameters.checkpoint_restore_path, "agent") + graph_manager.data_store.wait_for_checkpoints() + graph_manager.data_store.wait_for_trainer_ready() # Make the clients that will allow us to pause and unpause the physics rospy.wait_for_service('/gazebo/pause_physics_dr')
rospy.wait_for_service('/gazebo/unpause_physics_dr') @@ -168,7 +194,7 @@ def rollout_worker(graph_manager, num_workers, rollout_idx, task_parameters, sim graph_manager.evaluate(EnvironmentSteps(1)) else: time.sleep(5) - new_checkpoint = data_store.get_chkpoint_num('agent') + new_checkpoint = data_store.get_coach_checkpoint_number('agent') if is_save_mp4_enabled: unsubscribe_from_save_mp4(EmptyRequest()) logger.info("Completed iteration tasks. Writing results to S3.") @@ -277,7 +303,6 @@ def main(): args = parser.parse_args() - s3_client = SageS3Client(bucket=args.s3_bucket, s3_prefix=args.s3_prefix, aws_region=args.aws_region, s3_endpoint_url=args.s3_endpoint_url) logger.info("S3 bucket: %s", args.s3_bucket) logger.info("S3 prefix: %s", args.s3_prefix) logger.info("S3 endpoint URL: %s" % args.s3_endpoint_url) @@ -294,7 +319,10 @@ def main(): # Instantiate Cameras configure_camera(namespaces=['racecar']) - preset_file_success, _ = download_custom_files_if_present(s3_client, args.s3_prefix) + preset_file_success, _ = download_custom_files_if_present(s3_bucket=args.s3_bucket, + s3_prefix=args.s3_prefix, + aws_region=args.aws_region, + s3_endpoint_url=args.s3_endpoint_url) # download model metadata # TODO: replace 'agent' with name of each agent @@ -340,18 +368,27 @@ def main(): MetricsS3Keys.METRICS_KEY.value: metrics_key, MetricsS3Keys.ENDPOINT_URL.value: rospy.get_param('S3_ENDPOINT_URL', None), MetricsS3Keys.REGION.value: rospy.get_param('AWS_REGION')} - metrics_s3_model_cfg = {MetricsS3Keys.METRICS_BUCKET.value: args.s3_bucket, - MetricsS3Keys.METRICS_KEY.value: os.path.join(args.s3_prefix, - DEEPRACER_CHKPNT_KEY_SUFFIX), - MetricsS3Keys.REGION.value: args.aws_region} + run_phase_subject = RunPhaseSubject() agent_list = list() + + #TODO: replace agent for multi agent training + # checkpoint s3 instance + # TODO replace agent with agent_0 and so on for multiagent case + checkpoint = Checkpoint(bucket=args.s3_bucket, + s3_prefix=args.s3_prefix, + 
region_name=args.aws_region, + s3_endpoint_url=args.s3_endpoint_url, + agent_name='agent', + checkpoint_dir=args.checkpoint_dir) + + agent_list.append(create_rollout_agent(agent_config, TrainingMetrics(agent_name='agent', s3_dict_metrics=metrics_s3_config, - s3_dict_model=metrics_s3_model_cfg, - ckpnt_dir=args.checkpoint_dir, + deepracer_checkpoint_json=checkpoint.deepracer_checkpoint_json, + ckpnt_dir=os.path.join(args.checkpoint_dir, 'agent'), run_phase_sink=run_phase_subject, use_model_picker=(args.rollout_idx == 0)), run_phase_subject)) @@ -456,11 +493,8 @@ def main(): graph_manager.memory_backend_params = memory_backend_params - ds_params_instance = S3BotoDataStoreParameters(aws_region=args.aws_region, - bucket_names={'agent':args.s3_bucket}, - base_checkpoint_dir=args.checkpoint_dir, - s3_folders={'agent':args.s3_prefix}, - s3_endpoint_url=args.s3_endpoint_url) + checkpoint_dict = {'agent': checkpoint} + ds_params_instance = S3BotoDataStoreParameters(checkpoint_dict=checkpoint_dict) graph_manager.data_store = S3BotoDataStore(ds_params_instance, graph_manager) diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/__init__.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/constants.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/constants.py index 9bc9b9e1..d02bb9c1 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/constants.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/constants.py @@ -39,12 +39,14 @@ # replace {} with yaml file name YAML_LOCAL_PATH_FORMAT = os.path.join(CUSTOM_FILES_PATH, "{}") + class AgentType(Enum): '''agent types for simapp''' ROLLOUT = "rollout" EVALUATION = "evaluation" TOURNAMENT = "tournament" + class YamlKey(Enum): '''yaml key for all types of workers''' RACE_TYPE_YAML_KEY = "RACE_TYPE" 
@@ -68,6 +70,7 @@ class YamlKey(Enum): SAGEMAKER_SHARED_S3_BUCKET_YAML_KEY = "SAGEMAKER_SHARED_S3_BUCKET" SAGEMAKER_SHARED_S3_PREFIX_YAML_KEY = "SAGEMAKER_SHARED_S3_PREFIX" + EVAL_MANDATORY_YAML_KEY = [YamlKey.MODEL_S3_BUCKET_YAML_KEY.value, YamlKey.MODEL_S3_PREFIX_YAML_KEY.value, YamlKey.METRICS_S3_BUCKET_YAML_KEY.value, @@ -119,6 +122,7 @@ class YamlKey(Enum): CAMERA_TOPVIEW_LOCAL_PATH_FORMAT = os.path.join(CUSTOM_FILES_PATH, "iteration_data/{}/camera-topview/video.mp4") + class SimtraceVideoNames(Enum): SIMTRACE_EVAL = 'simtrace_eval' SIMTRACE_TRAINING = 'simtrace_training' @@ -126,6 +130,7 @@ class SimtraceVideoNames(Enum): DEGREE45 = 'degree45' TOPVIEW = 'topview' + # simtrace video dict SIMTRACE_VIDEO_POSTFIX_DICT = \ {SimtraceVideoNames.SIMTRACE_EVAL.value: SIMTRACE_EVAL_S3_POSTFIX, @@ -137,10 +142,74 @@ class SimtraceVideoNames(Enum): ############################# # ip config upload/download # ############################# -SAGEMAKER_WAIT_TIME = 1200 # 20 minutes +# 20 minutes +SAGEMAKER_WAIT_TIME = 1200 IP_ADDRESS_POSTFIX = "ip/ip.json" IP_DONE_POSTFIX = "ip/done" IP_ADDRESS_LOCAL_PATH = os.path.join(CUSTOM_FILES_PATH, 'ip.json') + +############## +# checkpoint # +############## +# best and last checkpoint +BEST_CHECKPOINT = 'best_checkpoint' +LAST_CHECKPOINT = 'last_checkpoint' + + +# sync files +class SyncFiles(Enum): + FINISHED = ".finished" + LOCKFILE = ".lock" + TRAINER_READY = ".ready" + + +# s3 postfix +CHECKPOINT_POSTFIX_DIR = "model" +COACH_CHECKPOINT_POSTFIX = os.path.join(CHECKPOINT_POSTFIX_DIR, + ".coach_checkpoint") +OLD_COACH_CHECKPOINT_POSTFIX = os.path.join(CHECKPOINT_POSTFIX_DIR, + "checkpoint") +DEEPRACER_CHECKPOINT_KEY_POSTFIX = os.path.join(CHECKPOINT_POSTFIX_DIR, + "deepracer_checkpoints.json") +FINISHED_FILE_KEY_POSTFIX = os.path.join(CHECKPOINT_POSTFIX_DIR, + SyncFiles.FINISHED.value) +LOCKFILE_KEY_POSTFIX = os.path.join(CHECKPOINT_POSTFIX_DIR, + SyncFiles.LOCKFILE.value) +TRAINER_READY_KEY_POSTFIX = 
os.path.join(CHECKPOINT_POSTFIX_DIR, + SyncFiles.TRAINER_READY.value) + +# SyncFiles s3 post dict +SYNC_FILES_POSTFIX_DICT = {SyncFiles.FINISHED.value: FINISHED_FILE_KEY_POSTFIX, + SyncFiles.LOCKFILE.value: LOCKFILE_KEY_POSTFIX, + SyncFiles.TRAINER_READY.value: TRAINER_READY_KEY_POSTFIX} + +# {} should be replaced by ./checkpoint_folder/agent_name +CHECKPOINT_LOCAL_DIR_FORMAT = os.path.join("{}") +COACH_CHECKPOINT_LOCAL_PATH_FORMAT = os.path.join(CHECKPOINT_LOCAL_DIR_FORMAT, + ".coach_checkpoint") +TEMP_COACH_CHECKPOINT_LOCAL_PATH_FORMAT = os.path.join(CHECKPOINT_LOCAL_DIR_FORMAT, + ".temp_coach_checkpoint") +OLD_COACH_CHECKPOINT_LOCAL_PATH_FORMAT = os.path.join(CHECKPOINT_LOCAL_DIR_FORMAT, + "checkpoint") +DEEPRACER_CHECKPOINT_LOCAL_PATH_FORMAT = os.path.join(CHECKPOINT_LOCAL_DIR_FORMAT, + "deepracer_checkpoints.json") +FINISHED_FILE_LOCAL_PATH_FORMAT = os.path.join(CHECKPOINT_LOCAL_DIR_FORMAT, + SyncFiles.FINISHED.value) +LOCKFILE_LOCAL_PATH_FORMAT = os.path.join(CHECKPOINT_LOCAL_DIR_FORMAT, + SyncFiles.LOCKFILE.value) +TRAINER_READY_LOCAL_PATH_FORMAT = os.path.join(CHECKPOINT_LOCAL_DIR_FORMAT, + SyncFiles.TRAINER_READY.value) + +# SyncFiles local path dict +SYNC_FILES_LOCAL_PATH_FORMAT_DICT = {SyncFiles.FINISHED.value: FINISHED_FILE_LOCAL_PATH_FORMAT, + SyncFiles.LOCKFILE.value: LOCKFILE_LOCAL_PATH_FORMAT, + SyncFiles.TRAINER_READY.value: TRAINER_READY_LOCAL_PATH_FORMAT} + +NUM_MODELS_TO_KEEP = 4 +TEMP_RENAME_FOLDER = "./renamed_checkpoint" +# Temporary folder where the model_{}.pb for best_checkpoint_iteration, last_checkpoint_iteration +# and other iterations > last_checkpoint_iteration are stored +SM_MODEL_PB_TEMP_FOLDER = './frozen_models' diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/__init__.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint.py 
b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint.py new file mode 100644 index 00000000..0e2c0097 --- /dev/null +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint.py @@ -0,0 +1,174 @@ +'''This module implements checkpoint file''' + +import os +import io +import logging +import json +import time +import boto3 +import botocore + +from rl_coach.checkpoint import CheckpointStateFile +from rl_coach.data_stores.data_store import SyncFiles +from markov.log_handler.logger import Logger +from markov.log_handler.exception_handler import log_and_exit +from markov.log_handler.constants import (SIMAPP_EVENT_ERROR_CODE_500, + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) +from markov.s3.constants import (CHECKPOINT_POSTFIX_DIR, + COACH_CHECKPOINT_POSTFIX, + DEEPRACER_CHECKPOINT_KEY_POSTFIX, + FINISHED_FILE_KEY_POSTFIX, + LOCKFILE_KEY_POSTFIX, + BEST_CHECKPOINT, + LAST_CHECKPOINT) +from markov.s3.files.checkpoint_files.deepracer_checkpoint_json import DeepracerCheckpointJson +from markov.s3.files.checkpoint_files.rl_coach_checkpoint import RLCoachCheckpoint +from markov.s3.files.checkpoint_files.rl_coach_sync_file import RlCoachSyncFile +from markov.s3.files.checkpoint_files.tensorflow_model import TensorflowModel + +LOG = Logger(__name__, logging.INFO).get_logger() + + +class Checkpoint(): + '''This class is a placeholder for RLCoachCheckpoint, DeepracerCheckpointJson, + RlCoachSyncFile, TensorflowModel to handle all checkpoint related logic + ''' + def __init__(self, bucket, s3_prefix, region_name="us-east-1", + s3_endpoint_url=None, + agent_name='agent', checkpoint_dir="./checkpoint", + max_retry_attempts=5, backoff_time_sec=1.0): + '''This class is a placeholder for RLCoachCheckpoint, DeepracerCheckpointJson, + RlCoachSyncFile, TensorflowModel to handle all checkpoint related logic + + Args: + bucket (str): S3 bucket string + s3_prefix (str): S3 prefix 
string + region_name (str): S3 region name + agent_name (str): agent name + checkpoint_dir (str): root checkpoint directory + max_retry_attempts (int): maximum number of retry attempts for S3 download/upload + backoff_time_sec (float): backoff second between each retry + ''' + if not bucket or not s3_prefix: + log_and_exit("checkpoint S3 prefix or bucket not available for S3. \ + bucket: {}, prefix {}" + .format(bucket, s3_prefix), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + self._agent_name = agent_name + self._s3_dir = os.path.normpath(os.path.join(s3_prefix, + CHECKPOINT_POSTFIX_DIR)) + + # rl coach checkpoint + self._rl_coach_checkpoint = RLCoachCheckpoint(bucket=bucket, + s3_prefix=s3_prefix, + region_name=region_name, + s3_endpoint_url=s3_endpoint_url, + local_dir=os.path.join(checkpoint_dir, + agent_name), + max_retry_attempts=max_retry_attempts, + backoff_time_sec=backoff_time_sec) + + # deepracer checkpoint json + # do not retry on deepracer checkpoint because initially + it may not exist.
+ self._deepracer_checkpoint_json = \ + DeepracerCheckpointJson(bucket=bucket, + s3_prefix=s3_prefix, + region_name=region_name, + s3_endpoint_url=s3_endpoint_url, + local_dir=os.path.join(checkpoint_dir, agent_name), + max_retry_attempts=0, + backoff_time_sec=backoff_time_sec) + + # rl coach .finished + self._syncfile_finished = RlCoachSyncFile(syncfile_type=SyncFiles.FINISHED.value, + bucket=bucket, + s3_prefix=s3_prefix, + region_name=region_name, + s3_endpoint_url=s3_endpoint_url, + local_dir=os.path.join(checkpoint_dir, + agent_name), + max_retry_attempts=max_retry_attempts, + backoff_time_sec=backoff_time_sec) + + # rl coach .lock: global lock for all agent located at checkpoint directory + self._syncfile_lock = RlCoachSyncFile(syncfile_type=SyncFiles.LOCKFILE.value, + bucket=bucket, + s3_prefix=s3_prefix, + region_name=region_name, + s3_endpoint_url=s3_endpoint_url, + local_dir=checkpoint_dir, + max_retry_attempts=max_retry_attempts, + backoff_time_sec=backoff_time_sec) + + # rl coach .ready + self._syncfile_ready = RlCoachSyncFile(syncfile_type=SyncFiles.TRAINER_READY.value, + bucket=bucket, + s3_prefix=s3_prefix, + region_name=region_name, + s3_endpoint_url=s3_endpoint_url, + local_dir=os.path.join(checkpoint_dir, + agent_name), + max_retry_attempts=max_retry_attempts, + backoff_time_sec=backoff_time_sec) + + # tensorflow .ckpt files + self._tensorflow_model = TensorflowModel(bucket=bucket, + s3_prefix=s3_prefix, + region_name=region_name, + s3_endpoint_url=s3_endpoint_url, + local_dir=os.path.join(checkpoint_dir, + agent_name), + max_retry_attempts=max_retry_attempts, + backoff_time_sec=backoff_time_sec) + + @property + def agent_name(self): + '''return agent name in str + ''' + return self._agent_name + + @property + def s3_dir(self): + '''return s3 directory in str + ''' + return self._s3_dir + + @property + def rl_coach_checkpoint(self): + '''return RLCoachCheckpoint class instance + ''' + return self._rl_coach_checkpoint + + @property + def 
deepracer_checkpoint_json(self): + '''return DeepracerCheckpointJson class instance + ''' + return self._deepracer_checkpoint_json + + @property + def syncfile_finished(self): + '''return RlCoachSyncFile .finished file class instance + ''' + return self._syncfile_finished + + @property + def syncfile_lock(self): + '''return RlCoachSyncFile .lock file class instance + ''' + return self._syncfile_lock + + @property + def syncfile_ready(self): + '''return RlCoachSyncFile .ready file class instance + ''' + return self._syncfile_ready + + @property + def tensorflow_model(self): + '''return TensorflowModel class instance + ''' + return self._tensorflow_model diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/__init__.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/deepracer_checkpoint_json.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/deepracer_checkpoint_json.py new file mode 100644 index 00000000..c372ae30 --- /dev/null +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/deepracer_checkpoint_json.py @@ -0,0 +1,165 @@ +'''This module implements deepracer checkpoint json file specifically''' + +import os +import logging +import json +import botocore + +from markov.log_handler.logger import Logger +from markov.log_handler.exception_handler import log_and_exit +from markov.log_handler.constants import (SIMAPP_EVENT_ERROR_CODE_500, + SIMAPP_EVENT_ERROR_CODE_400, + SIMAPP_SIMULATION_WORKER_EXCEPTION) +from markov.s3.s3_client import S3Client +from markov.s3.constants import (DEEPRACER_CHECKPOINT_KEY_POSTFIX, + DEEPRACER_CHECKPOINT_LOCAL_PATH_FORMAT, + BEST_CHECKPOINT, + LAST_CHECKPOINT) + +LOG = Logger(__name__, logging.INFO).get_logger() + + 
+class DeepracerCheckpointJson(): + '''This class is for deepracer checkpoint json file upload and download + ''' + def __init__(self, bucket, s3_prefix, region_name='us-east-1', + s3_endpoint_url=None, + local_dir='.checkpoint/agent', + max_retry_attempts=0, backoff_time_sec=1.0): + '''This class is for deepracer checkpoint json file upload and download + + Args: + bucket (str): S3 bucket string + s3_prefix (str): S3 prefix string + region_name (str): S3 region name + local_dir (str): local file directory + max_retry_attempts (int): maximum number of retry attempts for S3 download/upload + backoff_time_sec (float): backoff second between each retry + ''' + if not bucket or not s3_prefix: + log_and_exit("checkpoint S3 prefix or bucket not available for S3. \ + bucket: {}, prefix {}" + .format(bucket, s3_prefix), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + self._bucket = bucket + # deepracer checkpoint json s3 key + self._s3_key = os.path.normpath(os.path.join( + s3_prefix, + DEEPRACER_CHECKPOINT_KEY_POSTFIX)) + # deepracer checkpoint json local path + self._local_path = os.path.normpath( + DEEPRACER_CHECKPOINT_LOCAL_PATH_FORMAT.format(local_dir)) + self._s3_client = S3Client(region_name, + s3_endpoint_url, + max_retry_attempts, + backoff_time_sec) + + def _get_deepracer_checkpoint(self, checkpoint_type): + '''Returns the deepracer checkpoint stored in the checkpoint json + + Args: + checkpoint_type (str): BEST_CHECKPOINT/LAST_CHECKPOINT string + ''' + try: + # Download deepracer checkpoint + self._download() + except botocore.exceptions.ClientError as err: + if err.response['Error']['Code'] == "404": + LOG.info("Unable to find deepracer checkpoint json") + return None + else: + log_and_exit("Unable to download deepracer checkpoint json: {}, {}". 
+ format(self._bucket, err.response['Error']['Code']), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + except Exception as ex: + log_and_exit("Can't download deepracer checkpoint json: {}".format(ex), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + try: + with open(self._local_path) as deepracer_checkpoint_file: + checkpoint_name = json.load(deepracer_checkpoint_file)[checkpoint_type]["name"] + if not checkpoint_name: + raise Exception("No deepracer checkpoint json recorded") + os.remove(self._local_path) + except Exception as ex: + LOG.info("Unable to parse deepracer checkpoint json: {}".format(ex)) + return None + return checkpoint_name + + def get_deepracer_best_checkpoint(self): + '''get the best deepracer checkpoint name + + Returns: + str: best checkpoint name string + ''' + return self._get_deepracer_checkpoint(BEST_CHECKPOINT) + + def get_deepracer_last_checkpoint(self): + '''get the last deepracer checkpoint name + + Returns: + str: last checkpoint name string + ''' + return self._get_deepracer_checkpoint(LAST_CHECKPOINT) + + def get_deepracer_best_checkpoint_number(self): + '''get the best deepracer checkpoint number. If there is no best checkpoint, + it will return the last checkpoint. If there is no last checkpoint, it will return -1. + + Returns: + int: best checkpoint number in integer + ''' + checkpoint_num = -1 + best_checkpoint_name = self._get_deepracer_checkpoint(BEST_CHECKPOINT) + if best_checkpoint_name and len(best_checkpoint_name.split("_Step")) > 0: + checkpoint_num = int(best_checkpoint_name.split("_Step")[0]) + else: + LOG.info("Unable to find the best deepracer checkpoint number. Getting the last checkpoint number") + checkpoint_num = self.get_deepracer_last_checkpoint_number() + return checkpoint_num + + def get_deepracer_last_checkpoint_number(self): + '''get the last checkpoint number. 
If there is not last checkpoint, it will return -1 + + Returns: + int: last checkpoint number in integer + ''' + checkpoint_num = -1 + last_checkpoint_name = self._get_deepracer_checkpoint(LAST_CHECKPOINT) + # Verify if the last checkpoint name is present and is in right format + if last_checkpoint_name and len(last_checkpoint_name.split("_Step")) > 0: + checkpoint_num = int(last_checkpoint_name.split("_Step")[0]) + else: + LOG.info("Unable to find the last deepracer checkpoint number.") + return checkpoint_num + + def _download(self): + '''download deepracer checkpoint json file from s3 bucket + ''' + local_dir = os.path.dirname(self._local_path) + if not os.path.exists(local_dir): + os.makedirs(local_dir) + + self._s3_client.download_file(bucket=self._bucket, + s3_key=self._s3_key, + local_path=self._local_path) + LOG.info("[s3] Successfully downloaded deepracer checkpoint json from \ + s3 key {} to local {}.".format(self._s3_key, + self._local_path)) + + def persist(self, body, s3_kms_extra_args): + '''upload metrics into s3 bucket + + Args: + body (str): s3 upload string + s3_kms_extra_args (dict): s3 key management service extra argument + + ''' + self._s3_client.put_object(bucket=self._bucket, + s3_key=self._s3_key, + body=bytes(body, encoding='utf-8'), + s3_kms_extra_args=s3_kms_extra_args) + LOG.info("[s3] Successfully uploaded deepracer checkpoint to \ + s3 bucket {} with s3 key {}.".format(self._bucket, self._s3_key)) diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/rl_coach_checkpoint.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/rl_coach_checkpoint.py new file mode 100644 index 00000000..af407a9d --- /dev/null +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/rl_coach_checkpoint.py @@ -0,0 +1,240 @@ +'''This module implements rl coach coach_checkpoint file specifically''' + +import os +import re +import logging 
+import botocore + +from rl_coach.checkpoint import CheckpointStateFile +from markov.utils import get_s3_kms_extra_args +from markov.log_handler.logger import Logger +from markov.log_handler.exception_handler import log_and_exit +from markov.log_handler.constants import (SIMAPP_EVENT_ERROR_CODE_500, + SIMAPP_EVENT_ERROR_CODE_400, + SIMAPP_SIMULATION_WORKER_EXCEPTION) +from markov.s3.s3_client import S3Client +from markov.s3.constants import (COACH_CHECKPOINT_POSTFIX, + COACH_CHECKPOINT_LOCAL_PATH_FORMAT, + TEMP_COACH_CHECKPOINT_LOCAL_PATH_FORMAT, + OLD_COACH_CHECKPOINT_POSTFIX, + OLD_COACH_CHECKPOINT_LOCAL_PATH_FORMAT, + BEST_CHECKPOINT, + LAST_CHECKPOINT) + +LOG = Logger(__name__, logging.INFO).get_logger() + + +class RLCoachCheckpoint(): + '''This class is for RL coach checkpoint file + ''' + def __init__(self, bucket, s3_prefix, region_name='us-east-1', + s3_endpoint_url=None, + local_dir='./checkpoint/agent', + max_retry_attempts=5, backoff_time_sec=1.0): + '''This class is for RL coach checkpoint file + + Args: + bucket (str): S3 bucket string + s3_prefix (str): S3 prefix string + region_name (str): S3 region name + local_dir (str): local file directory + max_retry_attempts (int): maximum number of retry attempts for S3 download/upload + backoff_time_sec (float): backoff second between each retry + ''' + if not bucket or not s3_prefix: + log_and_exit("checkpoint S3 prefix or bucket not available for S3. 
\ + bucket: {}, prefix {}" + .format(bucket, s3_prefix), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + self._bucket = bucket + # coach checkpoint s3 key + self._s3_key = os.path.normpath(os.path.join( + s3_prefix, + COACH_CHECKPOINT_POSTFIX)) + # coach checkpoint local path + self._local_path = os.path.normpath( + COACH_CHECKPOINT_LOCAL_PATH_FORMAT.format(local_dir)) + # coach checkpoint local temp path + self._temp_local_path = os.path.normpath( + TEMP_COACH_CHECKPOINT_LOCAL_PATH_FORMAT.format(local_dir)) + # old coach checkpoint s3 key to handle backward compatibility + self._old_s3_key = os.path.normpath(os.path.join( + s3_prefix, + OLD_COACH_CHECKPOINT_POSTFIX)) + # old coach checkpoint local path to handle backward compatibility + self._old_local_path = os.path.normpath( + OLD_COACH_CHECKPOINT_LOCAL_PATH_FORMAT.format(local_dir)) + # coach checkpoint state file from rl coach + self._coach_checkpoint_state_file = CheckpointStateFile( + os.path.dirname(self._local_path)) + self._s3_client = S3Client(region_name, + s3_endpoint_url, + max_retry_attempts, + backoff_time_sec) + + @property + def s3_dir(self): + '''Return s3 directory in string + ''' + return os.path.dirname(self._s3_key) + + @property + def local_path(self): + '''Return local path in string + ''' + return self._local_path + + @property + def coach_checkpoint_state_file(self): + '''Return RL coach CheckpointStateFile class instance + ''' + return self._coach_checkpoint_state_file + + def get(self): + '''get rl coach checkpoint + ''' + self._download() + + def _download(self): + '''download rl coach checkpoint from s3 bucket + ''' + local_dir = os.path.dirname(self._local_path) + if not os.path.exists(local_dir): + os.makedirs(local_dir) + self._s3_client.download_file(bucket=self._bucket, + s3_key=self._s3_key, + local_path=self._local_path) + LOG.info("[s3] Successfully downloaded rl coach checkpoint from \ + s3 key {} to local {}.".format(self._s3_key, + self._local_path)) 
+ + def persist(self, s3_kms_extra_args): + '''upload rl coach checkpoint to s3 bucket + + Args: + s3_kms_extra_args (dict): s3 key management service extra argument + ''' + self._s3_client.upload_file(bucket=self._bucket, + s3_key=self._s3_key, + local_path=self._local_path, + s3_kms_extra_args=s3_kms_extra_args) + LOG.info("[s3] Successfully uploaded coach checkpoint to \ + s3 bucket {} with s3 key {}.".format(self._bucket, self._s3_key)) + + def _persist_temp_coach_checkpoint(self, s3_kms_extra_args): + '''upload rl temp coach checkpoint to s3 bucket for tensorflow model selection + and compatibility + + Args: + s3_kms_extra_args (dict): s3 key management service extra argument + ''' + self._s3_client.upload_file(bucket=self._bucket, + s3_key=self._s3_key, + local_path=self._temp_local_path, + s3_kms_extra_args=s3_kms_extra_args) + LOG.info("[s3] Successfully uploaded temp coach checkpoint to \ + s3 bucket {} with s3 key {}.".format(self._bucket, self._s3_key)) + + def update(self, model_checkpoint_name, s3_kms_extra_args): + '''update local coach checkpoint file and upload to s3 bucket + + Args: + model_checkpoint_name (str): model checkpoint string + s3_kms_extra_args (dict): s3 key management service extra argument + + Returns: + bool: True if update rl coach checkpoint successfully, False, otherwise. + This is mainly for validation worker to validate the model. + ''' + try: + # check model checkpoint is present and is type string + if model_checkpoint_name is None or not isinstance(model_checkpoint_name, str): + LOG.info("Exit because model_checkpoint_name is {} of type {}". 
+ format(model_checkpoint_name, type(model_checkpoint_name))) + return False + with open(self._temp_local_path, '+w') as new_ckpnt: + new_ckpnt.write(model_checkpoint_name) + # upload local temp rl coach checkpoint + self._persist_temp_coach_checkpoint(s3_kms_extra_args=s3_kms_extra_args) + # remove local temp rl coach checkpoint + os.remove(self._temp_local_path) + return True + except botocore.exceptions.ClientError as err: + log_and_exit("Unable to upload checkpoint: {}, {}" + .format(self._bucket, err.response['Error']['Code']), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + except Exception as ex: + log_and_exit("Exception in uploading checkpoint: {}".format(ex), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + + def is_compatible(self): + '''check whether rl coach checkpoint is compatible by checking + whether there is a .coach_checkpoint file present in the expected s3 bucket + + Returns: + bool: True if coach checkpoint is compatible, False otherwise + ''' + try: + coach_checkpoint_dir, coach_checkpoint_filename = \ + os.path.split(self._s3_key) + response = self._s3_client.list_objects_v2(bucket=self._bucket, + prefix=coach_checkpoint_dir) + if 'Contents' not in response: + # Customer deleted checkpoint file.
+ log_and_exit("No objects found: {}".format(self._bucket), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + + return any(list(map(lambda obj: os.path.split(obj['Key'])[1] == coach_checkpoint_filename, + response['Contents']))) + except botocore.exceptions.ClientError as e: + log_and_exit("No objects found: {}, {}" + .format(self._bucket, e.response['Error']['Code']), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + except Exception as e: + log_and_exit("Exception in checking for current checkpoint key: {}".format(e), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + + def make_compatible(self, syncfile_ready): + '''update coach checkpoint file to make it compatible + + Args: + syncfile_ready (RlCoachSyncFile): RlCoachSyncFile class instance for .ready file + ''' + try: + # download old coach checkpoint + self._s3_client.download_file(bucket=self._bucket, + s3_key=self._old_s3_key, + local_path=self._old_local_path) + # parse old coach checkpoint + with open(self._old_local_path) as old_coach_checkpoint_file: + coach_checkpoint_value = re.findall(r'"(.*?)"', old_coach_checkpoint_file.readline()) + if len(coach_checkpoint_value) != 1: + log_and_exit("No checkpoint file found", + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + # remove old local coach checkpoint + os.remove(self._old_local_path) + # Upload ready file so that the system can gab the checkpoints + syncfile_ready.persist(s3_kms_extra_args=get_s3_kms_extra_args()) + # write new temp coach checkpoint file + with open(self._temp_local_path, 'w+') as new_coach_checkpoint_file: + new_coach_checkpoint_file.write(coach_checkpoint_value[0]) + # upload new temp coach checkpoint file + self._persist_temp_coach_checkpoint(s3_kms_extra_args=get_s3_kms_extra_args()) + # remove new temp local coach checkpoint + os.remove(self._temp_local_path) + except botocore.exceptions.ClientError as e: + log_and_exit("Unable to make model 
compatible: {}, {}". + format(self._bucket, e.response['Error']['Code']), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + except Exception as e: + log_and_exit("Exception in making model compatible: {}".format(e), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/rl_coach_sync_file.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/rl_coach_sync_file.py new file mode 100644 index 00000000..6b714fdd --- /dev/null +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/rl_coach_sync_file.py @@ -0,0 +1,125 @@ +'''This module implements rl coach sync file specifically''' + +import os +import io +import botocore +import logging + +from markov.log_handler.logger import Logger +from markov.log_handler.exception_handler import log_and_exit +from markov.log_handler.constants import (SIMAPP_EVENT_ERROR_CODE_500, + SIMAPP_EVENT_ERROR_CODE_400, + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_S3_DATA_STORE_EXCEPTION) +from markov.s3.s3_client import S3Client +from markov.s3.constants import (SYNC_FILES_POSTFIX_DICT, + SYNC_FILES_LOCAL_PATH_FORMAT_DICT) + +LOG = Logger(__name__, logging.INFO).get_logger() + + +class RlCoachSyncFile(): + '''This class is for rl coach sync file: .finished, .lock, and .ready + ''' + def __init__(self, syncfile_type, bucket, s3_prefix, region_name="us-east-1", + s3_endpoint_url=None, + local_dir='./checkpoint', + max_retry_attempts=5, backoff_time_sec=1.0): + '''This class is for rl coach sync file: .finished, .lock, and .ready + + Args: + syncfile_type (str): sync file type + bucket (str): S3 bucket string + s3_prefix (str): S3 prefix string + local_dir (str): local file directory + checkpoint_dir (str): checkpoint directory + max_retry_attempts (int): maximum number of retry attempts for S3 download/upload + backoff_time_sec 
(float): backoff second between each retry + ''' + if not bucket or not s3_prefix: + log_and_exit("checkpoint S3 prefix or bucket not available for S3. \ + bucket: {}, prefix {}" + .format(bucket, s3_prefix), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + self._syncfile_type = syncfile_type + self._bucket = bucket + # deepracer checkpoint json s3 key + self._s3_key = os.path.normpath(os.path.join( + s3_prefix, + SYNC_FILES_POSTFIX_DICT[syncfile_type])) + # deepracer checkpoint json local path + self._local_path = os.path.normpath( + SYNC_FILES_LOCAL_PATH_FORMAT_DICT[syncfile_type].format(local_dir)) + self._s3_client = S3Client(region_name, + s3_endpoint_url, + max_retry_attempts, + backoff_time_sec) + + @property + def local_path(self): + '''Return local path in string + ''' + return self._local_path + + def list(self): + '''List sync file + ''' + return self._s3_client.list_objects_v2(bucket=self._bucket, + prefix=self._s3_key) + + def persist(self, s3_kms_extra_args): + '''persist sync file into s3 bucket by writing it locally first + + Args: + s3_kms_extra_args (dict): s3 key management service extra argument + ''' + try: + # make local dir is missing + local_dir = os.path.dirname(self._local_path) + if not os.path.exists(local_dir): + os.makedirs(local_dir) + + # persist to s3 + self._s3_client.upload_fileobj(bucket=self._bucket, + s3_key=self._s3_key, + fileobj=io.BytesIO(b''), + s3_kms_extra_args=s3_kms_extra_args) + LOG.info("[s3] Successfully uploaded {} to \ + s3 bucket {} with s3 key {}.".format(self._syncfile_type, + self._bucket, + self._s3_key)) + except botocore.exceptions.ClientError: + log_and_exit("Unable to upload {} file".format(self._syncfile_type), + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + except Exception as ex: + log_and_exit("Exception in uploading {} file {}".format(self._syncfile_type, ex), + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + + def download(self): + 
'''download the sync file from s3 bucket + ''' + self._download() + + def delete(self): + '''delete the sync file from s3 bucket + ''' + self._s3_client.delete_object(bucket=self._bucket, + s3_key=self._s3_key) + + def _download(self): + '''download file from s3 bucket + ''' + local_dir = os.path.dirname(self._local_path) + if not os.path.exists(local_dir): + os.makedirs(local_dir) + + self._s3_client.download_file(bucket=self._bucket, + s3_key=self._s3_key, + local_path=self._local_path) + LOG.info("[s3] Successfully downloaded {} from \ + s3 key {} to local {}.".format(self._syncfile_type, + self._s3_key, + self._local_path)) diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/tensorflow_model.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/tensorflow_model.py new file mode 100644 index 00000000..9a8fd5c7 --- /dev/null +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/checkpoint_files/tensorflow_model.py @@ -0,0 +1,416 @@ +'''This module implements tf model ckpt and pb file specifically''' + +import os +import re +import shutil +import queue +import logging + +import tensorflow as tf +from rl_coach.checkpoint import CheckpointStateFile, _filter_checkpoint_files, SingleCheckpoint +from markov import utils +from markov.log_handler.logger import Logger +from markov.log_handler.exception_handler import log_and_exit +from markov.log_handler.constants import (SIMAPP_EVENT_ERROR_CODE_500, + SIMAPP_EVENT_ERROR_CODE_400, + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_S3_DATA_STORE_EXCEPTION) +from markov.s3.constants import (CHECKPOINT_LOCAL_DIR_FORMAT, + CHECKPOINT_POSTFIX_DIR, + NUM_MODELS_TO_KEEP, + TEMP_RENAME_FOLDER, + SM_MODEL_PB_TEMP_FOLDER) +from markov.s3.s3_client import S3Client + +LOG = Logger(__name__, logging.INFO).get_logger() + +SM_MODEL_OUTPUT_DIR = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") + +if not 
os.path.exists(TEMP_RENAME_FOLDER): + os.makedirs(TEMP_RENAME_FOLDER) + + +class TensorflowModel(): + '''This class is for tensorflow model upload and download + ''' + def __init__(self, bucket, s3_prefix, region_name='us-east-1', + s3_endpoint_url=None, + local_dir='./checkpoint/agent', + max_retry_attempts=5, backoff_time_sec=1.0): + '''This class is for tensorflow model upload and download + + Args: + bucket (str): S3 bucket string + s3_prefix (str): S3 prefix string + region_name (str): S3 region name + local_dir (str): local file directory + max_retry_attempts (int): maximum number of retry attempts for S3 download/upload + backoff_time_sec (float): backoff second between each retry + ''' + if not bucket or not s3_prefix: + log_and_exit("checkpoint S3 prefix or bucket not available for S3. \ + bucket: {}, prefix {}" + .format(bucket, s3_prefix), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + self._bucket = bucket + self._local_dir = os.path.normpath( + CHECKPOINT_LOCAL_DIR_FORMAT.format(local_dir)) + self._s3_key_dir = os.path.normpath(os.path.join(s3_prefix, + CHECKPOINT_POSTFIX_DIR)) + self._delete_queue = queue.Queue() + self._s3_client = S3Client(region_name, + s3_endpoint_url, + max_retry_attempts, + backoff_time_sec) + + def _download(self, s3_key, local_path): + '''download files from s3 bucket + + Args: + s3_key (str): S3 key string + local_path (str): local path string + ''' + local_dir = os.path.dirname(local_path) + if not os.path.exists(local_dir): + os.makedirs(local_dir) + + self._s3_client.download_file(bucket=self._bucket, + s3_key=s3_key, + local_path=local_path) + + _, file_name = os.path.split(local_path) + LOG.info("[s3] Successfully downloaded {} from \ + s3 key {} to local {}.".format(file_name, + s3_key, + local_path)) + + def get(self, coach_checkpoint_state_file): + '''get tensorflow model specified in the rl coach checkpoint state file + If the rl coach checkpoint state file specified checkpoint is missing. 
It will + download last checkpoints and over write the last in local rl coach checkpoint state file + + Args: + coach_checkpoint_state_file (CheckpointStateFile): CheckpointStateFile instance + ''' + has_checkpoint = False + last_checkpoint_number = -1 + last_checkpoint_name = None + # list everything in tensorflow model s3 bucket dir + # to find the checkpoint specified in .coach_checkpoint + # or use the last + checkpoint_name = str(coach_checkpoint_state_file.read()) + for page in self._s3_client.paginate(bucket=self._bucket, prefix=self._s3_key_dir): + if "Contents" in page: + # Check to see if the desired tensorflow model is in the bucket + # for example if obj is (dir)/487_Step-2477372.ckpt.data-00000-of-00001 + # curr_checkpoint_number: 487 + # curr_checkpoint_name: 487_Step-2477372.ckpt.data-00000-of-00001 + for obj in page['Contents']: + curr_checkpoint_name = os.path.split(obj['Key'])[1] + # if found the checkpoint name stored in .coach_checkpoint file + # break inner loop for file search + if curr_checkpoint_name.startswith(checkpoint_name): + has_checkpoint = True + break + # if the file name does not start with a number (not ckpt file) + # continue for next file + if not utils.is_int_repr(curr_checkpoint_name.split("_")[0]): + continue + # if the file name start with a number, update the last checkpoint name + # and number + curr_checkpoint_number = int(curr_checkpoint_name.split("_")[0]) + if curr_checkpoint_number > last_checkpoint_number: + last_checkpoint_number = curr_checkpoint_number + last_checkpoint_name = curr_checkpoint_name.rsplit('.', 1)[0] + # break out from pagination if find the checkpoint + if has_checkpoint: + break + + # update checkpoint_name to the last_checkpoint_name and overwrite local + # .coach_checkpoint file to contain the last checkpoint + if not has_checkpoint: + if last_checkpoint_name: + coach_checkpoint_state_file.write(SingleCheckpoint( + num=last_checkpoint_number, + name=last_checkpoint_name)) + LOG.info("%s not in 
s3 bucket, downloading %s checkpoints", checkpoint_name, last_checkpoint_name) + checkpoint_name = last_checkpoint_name + else: + log_and_exit("No checkpoint files", + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + + # download the desired checkpoint file + for page in self._s3_client.paginate(bucket=self._bucket, prefix=self._s3_key_dir): + if "Contents" in page: + for obj in page['Contents']: + s3_key = obj["Key"] + _, file_name = os.path.split(s3_key) + local_path = os.path.normpath(os.path.join(self._local_dir, + file_name)) + _, file_extension = os.path.splitext(s3_key) + if file_extension != '.pb' and file_name.startswith(checkpoint_name): + self._download(s3_key=s3_key, local_path=local_path) + + def persist(self, coach_checkpoint_state_file, s3_kms_extra_args): + '''upload tensorflow model specified in rl coach checkpoint state file into the s3 bucket + + Args: + coach_checkpoint_state_file (CheckpointStateFile): CheckpointStateFile instance + s3_kms_extra_args (dict): s3 key management service extra argument + ''' + ckpt_state = None + check_point_key_list = [] + if coach_checkpoint_state_file.exists(): + ckpt_state = coach_checkpoint_state_file.read() + checkpoint_file = None + num_files_uploaded = 0 + for root, _, files in os.walk(self._local_dir): + for filename in files: + if filename == CheckpointStateFile.checkpoint_state_filename: + checkpoint_file = (root, filename) + continue + if filename.startswith(ckpt_state.name): + abs_name = os.path.abspath(os.path.join(root, filename)) + rel_name = os.path.relpath(abs_name, self._local_dir) + self._s3_client.upload_file(bucket=self._bucket, + s3_key=os.path.normpath( + os.path.join(self._s3_key_dir, + rel_name)), + local_path=abs_name, + s3_kms_extra_args=s3_kms_extra_args) + check_point_key_list.append(os.path.normpath(os.path.join(self._s3_key_dir, + rel_name))) + num_files_uploaded += 1 + LOG.info("Uploaded %s files for checkpoint %s", num_files_uploaded, ckpt_state.num) + if 
check_point_key_list: + self._delete_queue.put(check_point_key_list) + + def rename(self, coach_checkpoint_state_file, agent_name): + '''rename the tensorflow model specified in the rl coach checkpoint state file to include + agent name + + Args: + coach_checkpoint_state_file (CheckpointStateFile): CheckpointStateFile instance + agent_name (str): agent name + ''' + try: + LOG.info("Renaming checkpoint from checkpoint_dir: {} for agent: {}" + .format(self._local_dir, agent_name)) + checkpoint_name = str(coach_checkpoint_state_file.read()) + tf_checkpoint_file = os.path.join(self._local_dir, "checkpoint") + with open(tf_checkpoint_file, "w") as outfile: + outfile.write("model_checkpoint_path: \"{}\"".format(checkpoint_name)) + + with tf.Session() as sess: + for var_name, _ in tf.contrib.framework.list_variables(self._local_dir): + # Load the variable + var = tf.contrib.framework.load_variable(self._local_dir, var_name) + new_name = var_name + # Set the new name + # Replace agent/ or agent_#/ with {agent_name}/ + new_name = re.sub('agent/|agent_\d+/', '{}/'.format(agent_name), new_name) + # Rename the variable + var = tf.Variable(var, name=new_name) + saver = tf.train.Saver() + sess.run(tf.global_variables_initializer()) + renamed_checkpoint_path = os.path.join(TEMP_RENAME_FOLDER, checkpoint_name) + LOG.info('Saving updated checkpoint to {}'.format(renamed_checkpoint_path)) + saver.save(sess, renamed_checkpoint_path) + # Remove the tensorflow 'checkpoint' file + os.remove(tf_checkpoint_file) + # Remove the old checkpoint from the checkpoint dir + for file_name in os.listdir(self._local_dir): + if checkpoint_name in file_name: + os.remove(os.path.join(self._local_dir, file_name)) + # Copy the new checkpoint with renamed variable to the checkpoint dir + for file_name in os.listdir(TEMP_RENAME_FOLDER): + full_file_name = os.path.join(os.path.abspath(TEMP_RENAME_FOLDER), file_name) + if os.path.isfile(full_file_name) and file_name != "checkpoint": + 
shutil.copy(full_file_name, self._local_dir) + # Remove files from temp_rename_folder + shutil.rmtree(TEMP_RENAME_FOLDER) + tf.reset_default_graph() + # If either of the checkpoint files (index, meta or data) not found + except tf.errors.NotFoundError as err: + log_and_exit("No checkpoint found: {}".format(err), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + # Thrown when user modifies model, checkpoints get corrupted/truncated + except tf.errors.DataLossError as err: + log_and_exit("User modified ckpt, unrecoverable dataloss or corruption: {}" + .format(err), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + except tf.errors.OutOfRangeError as err: + log_and_exit("User modified ckpt: {}" + .format(err), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + except ValueError as err: + if utils.is_user_error(err): + log_and_exit("Couldn't find 'checkpoint' file or checkpoints in given \ + directory ./checkpoint: {}".format(err), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + else: + log_and_exit("ValueError in rename checkpoint: {}".format(err), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + except Exception as ex: + log_and_exit("Exception in rename checkpoint: {}".format(ex), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + + def persist_tensorflow_frozen_graph(self, agent_name, graph_manager, coach_checkpoint_state_file, + best_checkpoint_number, last_checkpoint_number, + s3_kms_extra_args): + '''persist the tensorflow frozen graph specified by rl coach checkpoint state file into s3 + + Args: + agent_name (str): agent name + graph_manager (MultiAgentGraphManager): MultiAgentGraphManager class instance + coach_checkpoint_state_file (CheckpointStateFile): CheckpointStateFile class instance + best_checkpoint_number (int): best checkpoint number + last_checkpoint_number (int): last checkpoint number + s3_kms_extra_args (dict): s3 key 
management service extra argument + ''' + # checkpoint state is always present for the checkpoint dir passed. + # We make same assumption while we get the best checkpoint in s3_metrics + checkpoint_num = coach_checkpoint_state_file.read().num + self.write_frozen_graph(graph_manager.sess, agent_name, checkpoint_num) + frozen_name = "model_{}.pb".format(checkpoint_num) + frozen_graph_local_path = os.path.join(SM_MODEL_PB_TEMP_FOLDER, + agent_name, + frozen_name) + # upload the model_.pb to S3. + self._s3_client.upload_file(bucket=self._bucket, + s3_key=os.path.normpath( + os.path.join(self._s3_key_dir, + frozen_name)), + local_path=frozen_graph_local_path, + s3_kms_extra_args=s3_kms_extra_args) + + LOG.info("saved intermediate frozen graph: %s", + os.path.normpath(os.path.join(self._s3_key_dir, frozen_name))) + + # Copy the best checkpoint to the SM_MODEL_OUTPUT_DIR + self.copy_best_frozen_graph_to_sm_output_dir( + best_checkpoint_number=best_checkpoint_number, + last_checkpoint_number=last_checkpoint_number, + source_dir=os.path.join(SM_MODEL_PB_TEMP_FOLDER, agent_name), + dest_dir=os.path.join(SM_MODEL_OUTPUT_DIR, agent_name)) + + def copy_best_frozen_graph_to_sm_output_dir(self, best_checkpoint_number, last_checkpoint_number, + source_dir, dest_dir): + """Copy the frozen model for the current best checkpoint from soure directory to the destination directory. 
+ +        Args: +            best_checkpoint_number (int): Iteration number of the current best checkpoint +            last_checkpoint_number (int): Iteration number of the last saved checkpoint +            source_dir (str): Source directory where the frozen models are present +            dest_dir (str): Sagemaker output directory where we store the frozen models for best checkpoint +        """ +        dest_dir_pb_files = [filename for filename in os.listdir(dest_dir) +                             if os.path.isfile(os.path.join(dest_dir, filename)) and filename.endswith(".pb")] +        source_dir_pb_files = [filename for filename in os.listdir(source_dir) +                               if os.path.isfile(os.path.join(source_dir, filename)) and filename.endswith(".pb")] + +        LOG.info("Best checkpoint number: {}, Last checkpoint number: {}". +                 format(best_checkpoint_number, last_checkpoint_number)) +        best_model_name = 'model_{}.pb'.format(best_checkpoint_number) +        last_model_name = 'model_{}.pb'.format(last_checkpoint_number) +        if len(source_dir_pb_files) < 1: +            log_and_exit("Could not find any frozen model file in the local directory", +                         SIMAPP_S3_DATA_STORE_EXCEPTION, +                         SIMAPP_EVENT_ERROR_CODE_500) +        try: +            # Could not find the deepracer_checkpoints.json file or there are no model.pb files in destination +            if best_checkpoint_number == -1 or len(dest_dir_pb_files) == 0: +                if len(source_dir_pb_files) > 1: +                    LOG.info("More than one model.pb found in the source directory.
Choosing the " + "first one to copy to destination: {}".format(source_dir_pb_files[0])) + # copy the frozen model present in the source directory + LOG.info("Copying the frozen checkpoint from {} to {}.".format( + os.path.join(source_dir, source_dir_pb_files[0]), os.path.join(dest_dir, "model.pb"))) + shutil.copy(os.path.join(source_dir, source_dir_pb_files[0]), os.path.join(dest_dir, "model.pb")) + else: + # Delete the current .pb files in the destination direcory + for filename in dest_dir_pb_files: + os.remove(os.path.join(dest_dir, filename)) + + # Copy the frozen model for the current best checkpoint to the destination directory + LOG.info("Copying the frozen checkpoint from {} to {}.".format( + os.path.join(source_dir, best_model_name), os.path.join(dest_dir, "model.pb"))) + shutil.copy(os.path.join(source_dir, best_model_name), os.path.join(dest_dir, "model.pb")) + + # Loop through the current list of frozen models in source directory and + # delete the iterations lower than last_checkpoint_iteration except best_model + for filename in source_dir_pb_files: + if filename not in [best_model_name, last_model_name]: + if len(filename.split("_")[1]) > 1 and len(filename.split("_")[1].split(".pb")): + file_iteration = int(filename.split("_")[1].split(".pb")[0]) + if file_iteration < last_checkpoint_number: + os.remove(os.path.join(source_dir, filename)) + else: + LOG.error("Frozen model name not in the right format in the source directory: {}, {}". 
+                                  format(filename, source_dir)) +        except FileNotFoundError as err: +            log_and_exit("No such file or directory: {}".format(err), +                         SIMAPP_S3_DATA_STORE_EXCEPTION, +                         SIMAPP_EVENT_ERROR_CODE_400) + +    def write_frozen_graph(self, sess, agent_name, iteration_id): +        """Write the frozen graph to the temporary folder with a name model_{}.pb for the iteration_id passed + +        Args: +            sess (dict): key as agent name and value as agent params +            agent_name (str): Name of the agent +            iteration_id (int): Iteration id for which we are saving the model_{}.pb +        """ +        if not os.path.exists(os.path.join(SM_MODEL_PB_TEMP_FOLDER, agent_name)): +            os.makedirs(os.path.join(SM_MODEL_PB_TEMP_FOLDER, agent_name)) +        if not os.path.exists(os.path.join(SM_MODEL_OUTPUT_DIR, agent_name)): +            os.makedirs(os.path.join(SM_MODEL_OUTPUT_DIR, agent_name)) +        output_head = ['main_level/{}/main/online/network_1/ppo_head_0/policy'.format(agent_name)] +        frozen = tf.graph_util.convert_variables_to_constants(sess[agent_name], +                                                              sess[agent_name].graph_def, +                                                              output_head) +        tf.train.write_graph(frozen, os.path.join(SM_MODEL_PB_TEMP_FOLDER, agent_name), +                             'model_{}.pb'.format(iteration_id), as_text=False) + +    def delete(self, coach_checkpoint_state_file, best_checkpoint): +        '''delete tensorflow models from s3 bucket + +        Args: +            coach_checkpoint_state_file (CheckpointStateFile): CheckpointStateFile class instance +            best_checkpoint (str): best checkpoint string +        ''' +        if coach_checkpoint_state_file.read() and \ +                self._delete_queue.qsize() > NUM_MODELS_TO_KEEP: +            while self._delete_queue.qsize() > NUM_MODELS_TO_KEEP: +                key_list = self._delete_queue.get() +                if best_checkpoint and all(list(map(lambda file_name: best_checkpoint in file_name, +                                                    [os.path.split(file)[-1] for file in key_list]))): +                    self._delete_queue.put(key_list) +                else: +                    delete_iteration_ids = set() +                    for key in key_list: +                        self._s3_client.delete_object(bucket=self._bucket, +                                                      s3_key=key) +                        # Get the name of the file in the checkpoint directory that has to
be deleted + # and extract the iteration id out of the name + file_in_checkpoint_dir = os.path.split(key)[-1] + if len(file_in_checkpoint_dir.split("_Step")) > 0: + delete_iteration_ids.add(file_in_checkpoint_dir.split("_Step")[0]) + LOG.info("Deleting the frozen models in s3 for the iterations: %s", + delete_iteration_ids) + # Delete the model_{}.pb files from the s3 bucket for the previous iterations + for iteration_id in list(delete_iteration_ids): + frozen_name = "model_{}.pb".format(iteration_id) + self._s3_client.delete_object(bucket=self._bucket, + s3_key=os.path.normpath( + os.path.join(self._s3_key_dir, + frozen_name)),) diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/hyperparameters.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/hyperparameters.py index cae50964..9448218f 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/hyperparameters.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/hyperparameters.py @@ -76,7 +76,7 @@ def persist(self, hyperparams_json, s3_kms_extra_args): s3_key=self._s3_key, fileobj=io.BytesIO(hyperparams_json.encode()), s3_kms_extra_args=s3_kms_extra_args) - LOG.info("[s3] Successfully upload hyperparameters to \ + LOG.info("[s3] Successfully uploaded hyperparameters to \ s3 bucket {} with s3 key {}.".format(self._bucket, self._s3_key)) def _download(self): diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/ip_config.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/ip_config.py index eee13165..3742175b 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/ip_config.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/ip_config.py @@ -54,9 +54,9 @@ def __init__(self, bucket, s3_prefix, region_name='us-east-1', IP_ADDRESS_POSTFIX)) self._local_path = local_path self._s3_client = S3Client(region_name, 
+ s3_endpoint_url, max_retry_attempts, - backoff_time_sec - s3_endpoint_url=s3_endpoint_url) + backoff_time_sec) self._ip_file = None def get_ip_config(self): @@ -86,7 +86,7 @@ def persist(self, s3_kms_extra_args): s3_key=self._s3_ip_address_key, fileobj=io.BytesIO(ip_address_json.encode()), s3_kms_extra_args=s3_kms_extra_args) - LOG.info("[s3] Successfully upload ip address to \ + LOG.info("[s3] Successfully uploaded ip address to \ s3 bucket {} with s3 key {}.".format(self._bucket, self._s3_ip_address_key)) # persist done second # if retry failed, s3_client upload_fileobj will log and exit 500 @@ -94,7 +94,7 @@ def persist(self, s3_kms_extra_args): s3_key=self._s3_ip_done_key, fileobj=io.BytesIO(b'done'), s3_kms_extra_args=s3_kms_extra_args) - LOG.info("[s3] Successfully upload ip done to \ + LOG.info("[s3] Successfully uploaded ip done to \ s3 bucket {} with s3 key {}.".format(self._bucket, self._s3_ip_done_key)) def _download(self): @@ -126,9 +126,9 @@ def _wait_for_ip_config(self): '''wait for ip config to be ready''' time_elapsed = 0 while time_elapsed < SAGEMAKER_WAIT_TIME: - # if retry failed, s3_client list_objects will log and exit 500 - response = self._s3_client.list_objects(bucket=self._bucket, - prefix=self._s3_ip_done_key) + # if retry failed, s3_client list_objects_v2 will log and exit 500 + response = self._s3_client.list_objects_v2(bucket=self._bucket, + prefix=self._s3_ip_done_key) if "Contents" in response: break time.sleep(1) diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/metrics.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/metrics.py index 4e02cc46..885709d6 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/metrics.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/metrics.py @@ -32,8 +32,9 @@ def __init__(self, bucket, s3_key, region_name='us-east-1', self._bucket = bucket self._s3_key = s3_key self._s3_client = 
S3Client(region_name, + s3_endpoint_url, max_retry_attempts, - backoff_time_sec,s3_endpoint_url=s3_endpoint_url) + backoff_time_sec) def persist(self, body, s3_kms_extra_args): '''upload metrics into s3 bucket diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/simtrace_video.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/simtrace_video.py index de36ceee..8413f696 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/simtrace_video.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/files/simtrace_video.py @@ -37,9 +37,9 @@ def __init__(self, upload_type, bucket, s3_prefix, self._local_path = local_path self._upload_num = 0 self._s3_client = S3Client(region_name, + s3_endpoint_url, max_retry_attempts, - backoff_time_sec, - s3_endpoint_url=s3_endpoint_url) + backoff_time_sec) def persist(self, s3_kms_extra_args): '''persist simtrace or video into s3 bucket diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/s3_client.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/s3_client.py index c0cd5631..b73b66c7 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/s3_client.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3/s3_client.py @@ -16,11 +16,11 @@ HYPERPARAMETERS, SAGEMAKER_S3_KMS_CMK_ARN, ROBOMAKER_S3_KMS_CMK_ARN, S3KmsEncryption) -S3_ERROR_MSG_FORMAT = "S3 failed, retry after {0} seconds. \ - Re-try count: {1}/{2}: [S3_Bucket: {3}, S3_Key: {4}]: {5}" +S3_ERROR_MSG_FORMAT = "S3 failed, retry after {0} seconds. 
Re-try count: {1}/{2}: {3}" logger = Logger(__name__, logging.INFO).get_logger() + class S3Client(): def __init__(self, region_name="us-east-1", s3_endpoint_url=None, max_retry_attempts=5, backoff_time_sec=1.0): '''S3 client @@ -76,8 +76,6 @@ def _exp_backoff(self, action_method, **kwargs): error_message = S3_ERROR_MSG_FORMAT.format(backoff_time, str(try_count), str(self._max_retry_attempts), - kwargs['Bucket'], - kwargs['Key'], e) logger.info(error_message) time.sleep(backoff_time) @@ -165,8 +163,8 @@ def upload_fileobj(self, bucket, s3_key, fileobj, s3_kms_extra_args): SIMAPP_S3_DATA_STORE_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500) - def list_objects(self, bucket, prefix): - '''list object from s3 with retry logic + def list_objects_v2(self, bucket, prefix): + '''list object v2 from s3 with retry logic Args: bucket (str): s3 bucket @@ -175,19 +173,18 @@ def list_objects(self, bucket, prefix): ''' try: - return self._exp_backoff(action_method=self._get_s3_client().list_objects, + return self._exp_backoff(action_method=self._get_s3_client().list_objects_v2, Bucket=bucket, Prefix=prefix) except botocore.exceptions.ClientError: log_and_exit("Unable to list objects", SIMAPP_S3_DATA_STORE_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_500) + SIMAPP_EVENT_ERROR_CODE_400) except Exception as ex: log_and_exit("Exception in listing objects: {}".format(ex), SIMAPP_S3_DATA_STORE_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500) - def put_object(self, bucket, s3_key, body, s3_kms_extra_args): '''put object into s3 with retry logic @@ -213,3 +210,51 @@ def put_object(self, bucket, s3_key, body, s3_kms_extra_args): log_and_exit("Exception in putting objects: {}".format(ex), SIMAPP_S3_DATA_STORE_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500) + + def delete_object(self, bucket, s3_key): + '''delete files specified by s3_key from s3 bucket + + Args: + bucket (str): s3 bucket + s3_key (str): s3 key + ''' + try: + self._exp_backoff(action_method=self._get_s3_client().delete_object, + Bucket=bucket, + Key=s3_key) 
+ except botocore.exceptions.ClientError as err: + log_and_exit("Unable to delete object from s3: bucket: {}, error: {}" + .format(bucket, err.response['Error']['Code']), + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + except Exception as ex: + log_and_exit("Unable to delete object from s3, exception: {}".format(ex), + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + + def paginate(self, bucket, prefix): + '''get paginator for list_objects_v2 + + Args: + bucket (str): s3 bucket + s3_key (str): s3 key + + Returns: + iter: page iterator + ''' + try: + # get a paginator for list_objects_v2 + kwargs = {'Bucket': bucket, + 'Prefix': prefix} + paginator = self._get_s3_client().get_paginator("list_objects_v2") + # paginate based on the kwargs + return paginator.paginate(**kwargs) + except botocore.exceptions.ClientError as err: + log_and_exit("Unable to paginate from s3, error: {}" + .format(err.response['Error']['Code']), + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + except Exception as ex: + log_and_exit("Unable to paginate from s3, exception: {}".format(ex), + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3_boto_data_store.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3_boto_data_store.py index 86a40d79..a1aaa3b2 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3_boto_data_store.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3_boto_data_store.py @@ -10,37 +10,23 @@ from rl_coach.checkpoint import CheckpointStateFile, _filter_checkpoint_files from rl_coach.data_stores.data_store import DataStore, DataStoreParameters, SyncFiles from markov.multi_agent_coach.multi_agent_graph_manager import MultiAgentGraphManager -from markov.utils import get_best_checkpoint, get_boto_config, \ - copy_best_frozen_model_to_sm_output_dir, get_s3_kms_extra_args +from 
markov.utils import get_s3_kms_extra_args from markov.log_handler.logger import Logger from markov.log_handler.exception_handler import log_and_exit from markov.log_handler.constants import (SIMAPP_EVENT_ERROR_CODE_500, SIMAPP_EVENT_ERROR_CODE_400, - SIMAPP_S3_DATA_STORE_EXCEPTION) + SIMAPP_S3_DATA_STORE_EXCEPTION, SIMAPP_SIMULATION_WORKER_EXCEPTION) +from markov.s3.files.checkpoint import Checkpoint import tensorflow as tf LOG = Logger(__name__, logging.INFO).get_logger() -# The number of models to keep in S3 -#! TODO discuss with product team if this number should be configurable -NUM_MODELS_TO_KEEP = 4 - SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND = 1 -SM_MODEL_OUTPUT_DIR = os.environ.get("ALGO_MODEL_DIR", "/opt/ml/model") -# Temporary folder where the model_{}.pb for best_checkpoint_iteration, last_checkpoint_iteration -# and other iterations > last_checkpoint_iteration are stored -SM_MODEL_PB_TEMP_FOLDER = './frozen_models' class S3BotoDataStoreParameters(DataStoreParameters): - def __init__(self, aws_region: str = "us-west-2", bucket_names: Dict[str, str] = {"agent": None}, - s3_endpoint_url: str = None, s3_folders: Dict[str, str] = {"agent": None}, - base_checkpoint_dir: str = None): + def __init__(self, checkpoint_dict: Dict[str, Checkpoint]): super().__init__("s3", "", "") - self.aws_region = aws_region - self.buckets = bucket_names - self.s3_folders = s3_folders - self.base_checkpoint_dir = base_checkpoint_dir - self.s3_endpoint_url = s3_endpoint_url + self.checkpoint_dict = checkpoint_dict class S3BotoDataStore(DataStore): @@ -48,228 +34,130 @@ class S3BotoDataStore(DataStore): def __init__(self, params: S3BotoDataStoreParameters, graph_manager: MultiAgentGraphManager, ignore_lock: bool = False): self.params = params - self.key_prefixes = dict() - self.ip_data_keys = dict() - self.ip_done_keys = dict() - self.preset_data_keys = dict() - self.delete_queues = dict() - for agent_key, s3_folder in self.params.s3_folders.items(): - 
self.key_prefixes[agent_key] = os.path.join(s3_folder, "model") - self.ip_data_keys[agent_key] = os.path.join(s3_folder, "ip/ip.json") - self.ip_done_keys[agent_key] = os.path.join(s3_folder, "ip/done") - self.preset_data_keys[agent_key] = os.path.join(s3_folder, "presets/preset.py") - self.delete_queues[agent_key] = queue.Queue() if not graph_manager: log_and_exit("None type for graph manager", SIMAPP_S3_DATA_STORE_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500) - self.graph_manager = graph_manager self.ignore_lock = ignore_lock - self.s3_extra_args = get_s3_kms_extra_args() - - def _get_s3_key(self, key, agent_key): - return os.path.join(self.key_prefixes[agent_key], key) - - def _get_client(self): - session = boto3.session.Session() - return session.client('s3', region_name=self.params.aws_region, endpoint_url=self.params.s3_endpoint_url, - config=get_boto_config()) + self.syncfile_lock = (list(self.params.checkpoint_dict.values())[0]).syncfile_lock def deploy(self) -> bool: return True - def get_info(self, agent_key): - return "s3://{}/{}".format(self.params.buckets[agent_key], self.params.s3_folder[agent_key]) + def get_info(self, agent_name): + return "s3://{}".format(self.params.checkpoint_dict[agent_name].s3_dir) def undeploy(self) -> bool: return True def upload_finished_file(self): + for _, checkpoint in self.params.checkpoint_dict.items(): + checkpoint.syncfile_finished.persist(s3_kms_extra_args=get_s3_kms_extra_args()) + + def save_to_store(self): try: - s3_client = self._get_client() - for agent_key, bucket in self.params.buckets.items(): - s3_client.upload_fileobj(Fileobj=io.BytesIO(b''), - Bucket=bucket, - Key=self._get_s3_key(SyncFiles.FINISHED.value, agent_key), - ExtraArgs=self.s3_extra_args) + # remove lock file if it exists + self.syncfile_lock.delete() + # acquire lock + self.syncfile_lock.persist(s3_kms_extra_args=get_s3_kms_extra_args()) + for _, checkpoint in self.params.checkpoint_dict.items(): + # upload tensorflow models, tensorflow frozen graph, 
and rl coach checkpoint + self._save_tf_model_to_store(checkpoint) + # release lock by delete it + self.syncfile_lock.delete() except botocore.exceptions.ClientError: - log_and_exit("Unable to upload finish file", + log_and_exit("Unable to upload checkpoint", SIMAPP_S3_DATA_STORE_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_400) except Exception as ex: - log_and_exit("Exception in uploading finish file {}".format(ex), + log_and_exit("Exception in uploading checkpoint: {}".format(ex), SIMAPP_S3_DATA_STORE_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500) - def save_to_store(self): + def signal_ready(self): + '''upload rl coach .ready file + ''' try: - s3_client = self._get_client() - base_checkpoint_dir = self.params.base_checkpoint_dir - for agent_key, bucket in self.params.buckets.items(): - # remove lock file if it exists - s3_client.delete_object(Bucket=bucket, Key=self._get_s3_key(SyncFiles.LOCKFILE.value, agent_key)) - - # acquire lock - s3_client.upload_fileobj(Fileobj=io.BytesIO(b''), - Bucket=bucket, - Key=self._get_s3_key(SyncFiles.LOCKFILE.value, agent_key), - ExtraArgs=self.s3_extra_args) - - checkpoint_dir = base_checkpoint_dir if len(self.graph_manager.agents_params) == 1 else \ - os.path.join(base_checkpoint_dir, agent_key) - - state_file = CheckpointStateFile(os.path.abspath(checkpoint_dir)) - ckpt_state = None - check_point_key_list = [] - if state_file.exists(): - ckpt_state = state_file.read() - checkpoint_file = None - num_files_uploaded = 0 - start_time = time.time() - for root, _, files in os.walk(checkpoint_dir): - for filename in files: - if filename == CheckpointStateFile.checkpoint_state_filename: - checkpoint_file = (root, filename) - continue - if filename.startswith(ckpt_state.name): - abs_name = os.path.abspath(os.path.join(root, filename)) - rel_name = os.path.relpath(abs_name, checkpoint_dir) - s3_client.upload_file(Filename=abs_name, - Bucket=bucket, - Key=self._get_s3_key(rel_name, agent_key), - ExtraArgs=self.s3_extra_args, - 
Config=boto3.s3.transfer.TransferConfig(multipart_threshold=1)) - check_point_key_list.append(self._get_s3_key(rel_name, agent_key)) - num_files_uploaded += 1 - time_taken = time.time() - start_time - LOG.info("Uploaded %s files for checkpoint %s in %.2f seconds", num_files_uploaded, ckpt_state.num, time_taken) - if check_point_key_list: - self.delete_queues[agent_key].put(check_point_key_list) - - abs_name = os.path.abspath(os.path.join(checkpoint_file[0], checkpoint_file[1])) - rel_name = os.path.relpath(abs_name, checkpoint_dir) - s3_client.upload_file(Filename=abs_name, - Bucket=bucket, - Key=self._get_s3_key(rel_name, agent_key), - ExtraArgs=self.s3_extra_args) - - # upload Finished if present - if os.path.exists(os.path.join(checkpoint_dir, SyncFiles.FINISHED.value)): - s3_client.upload_fileobj(Fileobj=io.BytesIO(b''), - Bucket=bucket, - Key=self._get_s3_key(SyncFiles.FINISHED.value, agent_key), - ExtraArgs=self.s3_extra_args) - - # upload Ready if present - if os.path.exists(os.path.join(checkpoint_dir, SyncFiles.TRAINER_READY.value)): - s3_client.upload_fileobj(Fileobj=io.BytesIO(b''), - Bucket=bucket, - Key=self._get_s3_key(SyncFiles.TRAINER_READY.value, agent_key), - ExtraArgs=self.s3_extra_args) - - # release lock - s3_client.delete_object(Bucket=bucket, - Key=self._get_s3_key(SyncFiles.LOCKFILE.value, agent_key)) - - # Upload the frozen graph which is used for deployment - if self.graph_manager: - # checkpoint state is always present for the checkpoint dir passed. - # We make same assumption while we get the best checkpoint in s3_metrics - checkpoint_num = ckpt_state.num - self.write_frozen_graph(self.graph_manager, agent_key, checkpoint_num) - frozen_name = "model_{}.pb".format(checkpoint_num) - frozen_graph_fpath = os.path.join(SM_MODEL_PB_TEMP_FOLDER, agent_key, - frozen_name) - frozen_graph_s3_name = frozen_name if len(self.graph_manager.agents_params) == 1 \ - else os.path.join(agent_key, frozen_name) - # upload the model_.pb to S3. 
- s3_client.upload_file(Filename=frozen_graph_fpath, - Bucket=bucket, - Key=self._get_s3_key(frozen_graph_s3_name, agent_key), - ExtraArgs=self.s3_extra_args) - LOG.info("saved intermediate frozen graph: %s", self._get_s3_key(frozen_graph_s3_name, agent_key)) - - # Copy the best checkpoint to the SM_MODEL_OUTPUT_DIR - copy_best_frozen_model_to_sm_output_dir(bucket, - self.params.s3_folders[agent_key], - self.params.aws_region, - os.path.join(SM_MODEL_PB_TEMP_FOLDER, agent_key), - os.path.join(SM_MODEL_OUTPUT_DIR, agent_key), - self.params.s3_endpoint_url) - - # Clean up old checkpoints - if ckpt_state and self.delete_queues[agent_key].qsize() > NUM_MODELS_TO_KEEP: - best_checkpoint = get_best_checkpoint(bucket, - self.params.s3_folders[agent_key], - self.params.aws_region, - self.params.s3_endpoint_url) - while self.delete_queues[agent_key].qsize() > NUM_MODELS_TO_KEEP: - key_list = self.delete_queues[agent_key].get() - if best_checkpoint and all(list(map(lambda file_name: best_checkpoint in file_name, - [os.path.split(file)[-1] for file in key_list]))): - self.delete_queues[agent_key].put(key_list) - else: - delete_iteration_ids = set() - for key in key_list: - s3_client.delete_object(Bucket=bucket, Key=key) - # Get the name of the file in the checkpoint directory that has to be deleted - # and extract the iteration id out of the name - file_in_checkpoint_dir = os.path.split(key)[-1] - if len(file_in_checkpoint_dir.split("_Step")) > 0: - delete_iteration_ids.add(file_in_checkpoint_dir.split("_Step")[0]) - LOG.info("Deleting the frozen models in s3 for the iterations: %s", - delete_iteration_ids) - # Delete the model_{}.pb files from the s3 bucket for the previous iterations - for iteration_id in list(delete_iteration_ids): - frozen_name = "model_{}.pb".format(iteration_id) - frozen_graph_s3_name = frozen_name if len(self.graph_manager.agents_params) == 1 \ - else os.path.join(agent_key, frozen_name) - s3_client.delete_object(Bucket=bucket, - 
Key=self._get_s3_key(frozen_graph_s3_name, agent_key)) + # remove lock file if it exists + self.syncfile_lock.delete() + # acquire lock + self.syncfile_lock.persist(s3_kms_extra_args=get_s3_kms_extra_args()) + for _, checkpoint in self.params.checkpoint_dict.items(): + # upload .ready + checkpoint.syncfile_ready.persist(s3_kms_extra_args=get_s3_kms_extra_args()) + # release lock by delete it + self.syncfile_lock.delete() except botocore.exceptions.ClientError: - log_and_exit("Unable to upload checkpoint", + log_and_exit("Unable to upload .ready", SIMAPP_S3_DATA_STORE_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_400) except Exception as ex: - log_and_exit("Exception in uploading checkpoint: {}".format(ex), + log_and_exit("Exception in uploading .ready file: {}".format(ex), SIMAPP_S3_DATA_STORE_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500) - def write_frozen_graph(self, graph_manager, agent_name, iteration_id): - """Write the frozen graph to the temporary folder with a name model_{}.pb for the iteration_id passed - Args: - graph_manager (MultiAgentGraphManager): MultiAgentGraphManager object - agent_name (str): Name of the agent - iteration_id (int): Iteration id for which we are saving the model_{}.pb - """ - if not os.path.exists(os.path.join(SM_MODEL_PB_TEMP_FOLDER, agent_name)): - os.makedirs(os.path.join(SM_MODEL_PB_TEMP_FOLDER, agent_name)) - if not os.path.exists(os.path.join(SM_MODEL_OUTPUT_DIR, agent_name)): - os.makedirs(os.path.join(SM_MODEL_OUTPUT_DIR, agent_name)) - output_head = ['main_level/{}/main/online/network_1/ppo_head_0/policy'.format(agent_name)] - frozen = tf.graph_util.convert_variables_to_constants(graph_manager.sess[agent_name], - graph_manager.sess[agent_name].graph_def, output_head) - tf.train.write_graph(frozen, os.path.join(SM_MODEL_PB_TEMP_FOLDER, agent_name), - 'model_{}.pb'.format(iteration_id), as_text=False) - - def get_chkpoint_num(self, agent_key): + def flush_finished(self): + '''upload rl coach .finished file + ''' + try: + # remove lock file 
if it exists + self.syncfile_lock.delete() + # acquire lock + self.syncfile_lock.persist(s3_kms_extra_args=get_s3_kms_extra_args()) + for _, checkpoint in self.params.checkpoint_dict.items(): + # upload .finished + checkpoint.syncfile_finished.persist(s3_kms_extra_args=get_s3_kms_extra_args()) + + # release lock by delete it + self.syncfile_lock.delete() + except botocore.exceptions.ClientError: + log_and_exit("Unable to upload .finished", + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + except Exception as ex: + log_and_exit("Exception in uploading .finished file: {}".format(ex), + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + + def _save_tf_model_to_store(self, checkpoint): + # rl coach .coach_checkpoint state file + state_file = checkpoint.rl_coach_checkpoint.coach_checkpoint_state_file + + # upload tensorflow models + checkpoint.tensorflow_model.persist( + coach_checkpoint_state_file=state_file, + s3_kms_extra_args=get_s3_kms_extra_args()) + + # persist rl coach checkpoint + checkpoint.rl_coach_checkpoint.persist(s3_kms_extra_args=get_s3_kms_extra_args()) + + # Upload the frozen graph which is used for deployment + if self.graph_manager: + checkpoint.tensorflow_model.persist_tensorflow_frozen_graph( + agent_name=checkpoint.agent_name, + graph_manager=self.graph_manager, + coach_checkpoint_state_file=state_file, + best_checkpoint_number=checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint_number(), + last_checkpoint_number=checkpoint.deepracer_checkpoint_json.get_deepracer_last_checkpoint_number(), + s3_kms_extra_args=get_s3_kms_extra_args()) + + # Clean up old checkpoints + checkpoint.tensorflow_model.delete( + coach_checkpoint_state_file=state_file, + best_checkpoint=checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint()) + + def get_coach_checkpoint_number(self, agent_key): try: - s3_client = self._get_client() # If there is a lock file return -1 since it means the trainer has the lock - 
response = s3_client.list_objects_v2(Bucket=self.params.buckets[agent_key], - Prefix=self._get_s3_key(SyncFiles.LOCKFILE.value, agent_key)) + response = self.syncfile_lock.list() chkpoint_num = -1 if "Contents" not in response: - base_checkpoint_dir = self.params.base_checkpoint_dir - checkpoint_dir = base_checkpoint_dir if len(self.graph_manager.agents_params) == 1 else os.path.join(base_checkpoint_dir, agent_key) - if not os.path.exists(checkpoint_dir): - os.makedirs(checkpoint_dir) - state_file = CheckpointStateFile(os.path.abspath(checkpoint_dir)) - s3_client.download_file(Bucket=self.params.buckets[agent_key], - Key=self._get_s3_key(state_file.filename, agent_key), - Filename=state_file.path) - checkpoint_state = state_file.read() + # download rl coach .coach_checkpoint file + self.params.checkpoint_dict[agent_key].rl_coach_checkpoint.get() + # read .coach_checkpoint file after download + checkpoint_state = \ + self.params.checkpoint_dict[agent_key].rl_coach_checkpoint.coach_checkpoint_state_file.read() if checkpoint_state is not None: chkpoint_num = checkpoint_state.num return chkpoint_num @@ -283,120 +171,25 @@ def get_chkpoint_num(self, agent_key): SIMAPP_EVENT_ERROR_CODE_500) def load_from_store(self, expected_checkpoint_number=-1): + '''download tf model, rl coach .coach_checkpoint, .finished, .ready file from s3 + + Args: + expected_checkpoint_number (int): for training, rollout worker will expect the latest + file for eval, tournament, validation, expected_checkpoint_number will always be -1 + to make sure last/best tf model can be downloaded + ''' try: - s3_client = self._get_client() - base_checkpoint_dir = self.params.base_checkpoint_dir - for agent_key, bucket in self.params.buckets.items(): - checkpoint_dir = base_checkpoint_dir if len(self.graph_manager.agents_params) == 1 else os.path.join(base_checkpoint_dir, agent_key) - if not os.path.exists(checkpoint_dir): - os.makedirs(checkpoint_dir) + for _, checkpoint in 
self.params.checkpoint_dict.items(): while True: - s3_client = self._get_client() - state_file = CheckpointStateFile(os.path.abspath(checkpoint_dir)) - - # wait until lock is removed - response = s3_client.list_objects_v2(Bucket=bucket, - Prefix=self._get_s3_key(SyncFiles.LOCKFILE.value, agent_key)) - if "Contents" not in response or self.ignore_lock: - try: - checkpoint_file_path = os.path.abspath(os.path.join(checkpoint_dir, - state_file.path)) - # fetch checkpoint state file from S3 - s3_client.download_file(Bucket=bucket, - Key=self._get_s3_key(state_file.filename, agent_key), - Filename=checkpoint_file_path) - except botocore.exceptions.ClientError: - if self.ignore_lock: - log_and_exit("Checkpoint not found", - SIMAPP_S3_DATA_STORE_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND) - continue - except Exception: - if self.ignore_lock: - log_and_exit("Checkpoint not found", - SIMAPP_S3_DATA_STORE_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_500) - time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND) - continue - else: - time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND) + # load tf models and rl coach .coach_checkpoint from s3 store + if not self._load_tf_model_from_store(checkpoint=checkpoint, + expected_checkpoint_number=expected_checkpoint_number): continue - - # check if there's a Finished file - response = s3_client.list_objects_v2(Bucket=bucket, - Prefix=self._get_s3_key(SyncFiles.FINISHED.value, agent_key)) - if "Contents" in response: - try: - finished_file_path = os.path.abspath(os.path.join(checkpoint_dir, - SyncFiles.FINISHED.value)) - s3_client.download_file(Bucket=bucket, - Key=self._get_s3_key(SyncFiles.FINISHED.value, agent_key), - Filename=finished_file_path) - except Exception: - pass - - # check if there's a Ready file - response = s3_client.list_objects_v2(Bucket=bucket, - Prefix=self._get_s3_key(SyncFiles.TRAINER_READY.value, agent_key)) - if "Contents" 
in response: - try: - ready_file_path = os.path.abspath(os.path.join(checkpoint_dir, - SyncFiles.TRAINER_READY.value)) - s3_client.download_file(Bucket=bucket, - Key=self._get_s3_key(SyncFiles.TRAINER_READY.value, agent_key), - Filename=ready_file_path) - except Exception: - pass - - checkpoint_state = state_file.read() - if checkpoint_state is not None: - - # if we get a checkpoint that is older that the expected checkpoint, we wait for - # the new checkpoint to arrive. - - if checkpoint_state.num < expected_checkpoint_number: - time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND) - continue - - response = s3_client.list_objects_v2(Bucket=bucket, - Prefix=self._get_s3_key("", agent_key)) - if "Contents" in response: - # Check to see if the desired checkpoint is in the bucket - has_chkpnt = any(list(map(lambda obj: os.path.split(obj['Key'])[1].\ - startswith(checkpoint_state.name), - response['Contents']))) - for obj in response["Contents"]: - full_key_prefix = os.path.normpath(self.key_prefixes[agent_key]) + "/" - filename = os.path.abspath(os.path.join(checkpoint_dir, - obj["Key"].\ - replace(full_key_prefix, ""))) - dirname, basename = os.path.split(filename) - # Download all the checkpoints but not the frozen models since they - # are not necessary - _, file_extension = os.path.splitext(obj["Key"]) - if file_extension != '.pb' \ - and (basename.startswith(checkpoint_state.name) or not has_chkpnt): - if not os.path.exists(dirname): - os.makedirs(dirname) - s3_client.download_file(Bucket=bucket, - Key=obj["Key"], - Filename=filename) - # Change the coach checkpoint file to point to the latest available checkpoint, - # also log that we are changing the checkpoint. 
- if not has_chkpnt: - all_ckpnts = _filter_checkpoint_files(os.listdir(checkpoint_dir)) - if all_ckpnts: - LOG.info("%s not in s3 bucket, downloading all checkpoints \ - and using %s", checkpoint_state.name, all_ckpnts[-1]) - state_file.write(all_ckpnts[-1]) - else: - log_and_exit("No checkpoint files", - SIMAPP_S3_DATA_STORE_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) + # load .finished from s3 store + self._load_syncfile_from_store(sync_file=checkpoint.syncfile_finished) + # load .ready from s3 store + self._load_syncfile_from_store(sync_file=checkpoint.syncfile_ready) break - return True - except botocore.exceptions.ClientError: log_and_exit("Unable to download checkpoint", SIMAPP_S3_DATA_STORE_EXCEPTION, @@ -405,3 +198,131 @@ def load_from_store(self, expected_checkpoint_number=-1): log_and_exit("Exception in downloading checkpoint: {}".format(ex), SIMAPP_S3_DATA_STORE_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500) + + def load_trainer_ready_from_store(self): + try: + for _, checkpoint in self.params.checkpoint_dict.items(): + # load .ready from s3 store + self._load_syncfile_from_store(sync_file=checkpoint.syncfile_ready) + except botocore.exceptions.ClientError: + log_and_exit("Unable to download .ready", + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + except Exception as ex: + log_and_exit("Exception in downloading .ready: {}".format(ex), + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + + def _load_syncfile_from_store(self, sync_file): + '''download a specific sync file from s3 if exist + + Args: + sync_file (RlCoachSyncFile): RlCoachSyncFile class instance + ''' + # list rl coach sync file + response = sync_file.list() + if "Contents" in response: + try: + # download rl coach sync file + sync_file.download() + except Exception: + pass + + def _load_tf_model_from_store(self, checkpoint, expected_checkpoint_number): + '''load tf models and rl coach .coach_checkpoint from s3 store + + Args: + checkpoint (Checkpoint): Checkpoint 
class instance + expected_checkpoint_number (int): for training, rollout worker will expect the latest + file for eval, tournament, validation, expected_checkpoint_number will always be -1 + to make sure last/best tf model can be downloaded + + Returns: + bool: True if load tf model from store succeed. Otherwise, False + ''' + # list rl coach .lock + response = self.syncfile_lock.list() + if "Contents" not in response or self.ignore_lock: + try: + # download rl coach checkpoint + checkpoint.rl_coach_checkpoint.get() + except botocore.exceptions.ClientError: + if self.ignore_lock: + log_and_exit("Checkpoint not found", + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_400) + time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND) + return False + except Exception: + if self.ignore_lock: + log_and_exit("Checkpoint not found", + SIMAPP_S3_DATA_STORE_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND) + return False + else: + time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND) + return False + + checkpoint_state = checkpoint.rl_coach_checkpoint.coach_checkpoint_state_file.read() + if checkpoint_state is not None: + # if we get a checkpoint that is older that the expected checkpoint, we wait for + # the new checkpoint to arrive. 
+ if checkpoint_state.num < expected_checkpoint_number: + time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND) + return False + # download tensorflow models + checkpoint.tensorflow_model.get( + checkpoint.rl_coach_checkpoint.coach_checkpoint_state_file) + return True + + def wait_for_checkpoints(self, timeout=10): + """ + block until there is a checkpoint in all of the checkpoint_dirs + """ + for _ in range(timeout): + self.load_from_store() + all_agent_checkpoint_copied = \ + all([checkpoint.rl_coach_checkpoint.coach_checkpoint_state_file.read() is not None + for _, checkpoint in self.params.checkpoint_dict.items()]) + if all_agent_checkpoint_copied: + return + time.sleep(10) + + # one last time + all_agent_checkpoint_copied = \ + all([checkpoint.rl_coach_checkpoint.coach_checkpoint_state_file.read() is not None + for _, checkpoint in self.params.checkpoint_dict.items()]) + if all_agent_checkpoint_copied: + return + + log_and_exit("Checkpoint never found, waited {} seconds.".format(timeout), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + + def wait_for_trainer_ready(self, timeout=10): + for _ in range(timeout): + self.load_trainer_ready_from_store() + all_agent_ready_copied = \ + all(["Contents" in checkpoint.syncfile_ready.list() + for _, checkpoint in self.params.checkpoint_dict.items()]) + if all_agent_ready_copied: + return + time.sleep(10) + + # one last time + all_agent_ready_copied = \ + all(["Contents" in checkpoint.syncfile_ready.list() + for _, checkpoint in self.params.checkpoint_dict.items()]) + if all_agent_ready_copied: + return + + log_and_exit("ready never found, waited {} seconds.".format(timeout), + SIMAPP_SIMULATION_WORKER_EXCEPTION, + SIMAPP_EVENT_ERROR_CODE_500) + + def modify_checkpoint_variables(self): + for agent_name, checkpoint in self.params.checkpoint_dict.items(): + checkpoint.tensorflow_model.rename( + coach_checkpoint_state_file=checkpoint.rl_coach_checkpoint.coach_checkpoint_state_file, + 
agent_name=agent_name) diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3_client.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3_client.py deleted file mode 100644 index 8e1be3d7..00000000 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/s3_client.py +++ /dev/null @@ -1,73 +0,0 @@ -import io -import logging -import os -import json -import time -import boto3 -import botocore -from markov.utils import get_boto_config, get_s3_kms_extra_args, test_internet_connection -from markov.log_handler.logger import Logger -from markov.log_handler.exception_handler import log_and_exit -from markov.log_handler.constants import (SIMAPP_EVENT_ERROR_CODE_500, SIMAPP_EVENT_ERROR_CODE_400, - SIMAPP_S3_DATA_STORE_EXCEPTION) - -LOG = Logger(__name__, logging.INFO).get_logger() - -class SageS3Client(): - def __init__(self, bucket=None, s3_prefix=None, aws_region=None, s3_endpoint_url=None): - self.aws_region = aws_region - self.bucket = bucket - self.s3_prefix = s3_prefix - self.s3_endpoint_url = s3_endpoint_url - self.model_checkpoints_prefix = os.path.normpath(s3_prefix + "/model/") + "/" - self.s3_extra_args = get_s3_kms_extra_args() - LOG.info("Initializing SageS3Client...") - - def get_client(self): - session = boto3.session.Session() - return session.client('s3', region_name=self.aws_region, endpoint_url=self.s3_endpoint_url, config=get_boto_config()) - - def _get_s3_key(self, key): - return os.path.normpath(self.model_checkpoints_prefix + "/" + key) - - def download_file(self, s3_key, local_path): - s3_client = self.get_client() - try: - s3_client.download_file(self.bucket, s3_key, local_path) - return True - except botocore.exceptions.ClientError as err: - # It is possible that the file isn't there in which case we should - # return fasle and let the client decide the next action - if err.response['Error']['Code'] == "404": - return False - else: - log_and_exit("Unable to download file (s3bucket: {} s3_key: 
{})".format(self.bucket, - s3_key), - SIMAPP_S3_DATA_STORE_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - except botocore.exceptions.ConnectTimeoutError as ex: - log_and_exit("Issue with your current VPC stack and IAM roles.\ - You might need to reset your account resources: {}".format(ex), - SIMAPP_S3_DATA_STORE_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - except Exception as ex: - log_and_exit("Exception in downloading file (s3bucket: {} s3_key: {}): {}".format(self.bucket, - s3_key, - ex), - SIMAPP_S3_DATA_STORE_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_500) - - def upload_file(self, s3_key, local_path): - s3_client = self.get_client() - try: - s3_client.upload_file(Filename=local_path, Bucket=self.bucket, Key=s3_key, - ExtraArgs=self.s3_extra_args) - return True - except botocore.exceptions.ClientError: - log_and_exit("Unable to upload file", - SIMAPP_S3_DATA_STORE_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - except Exception as ex: - log_and_exit("Exception in uploading file: {}".format(ex), - SIMAPP_S3_DATA_STORE_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_500) diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/samples/sample_collector.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/samples/sample_collector.py index ae856eff..8fa70947 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/samples/sample_collector.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/samples/sample_collector.py @@ -1,32 +1,53 @@ import pickle import os + +from markov.utils import get_s3_kms_extra_args +from markov.s3.s3_client import S3Client from markov.log_handler.deepracer_exceptions import GenericTrainerException + class SampleCollector: """ Sample Collector class to collect sample and persist to S3. 
""" - def __init__(self, s3_client, s3_prefix, max_sample_count=None, sampling_frequency=None): + def __init__(self, bucket, s3_prefix, region_name, + s3_endpoint_url=None, + max_sample_count=None, sampling_frequency=None, + max_retry_attempts=5, backoff_time_sec=1.0): + '''Sample Collector class to collect sample and persist to S3. + + Args: + bucket (str): S3 bucket string + s3_prefix (str): S3 prefix string + region_name (str): S3 region name + max_sample_count (int): max sample count + sampling_frequency (int): sampleing frequency + max_retry_attempts (int): maximum number of retry attempts for S3 download/upload + backoff_time_sec (float): backoff second between each retry + ''' self.max_sample_count = max_sample_count or 0 self.sampling_frequency = sampling_frequency or 1 if self.sampling_frequency < 1: err_msg = "sampling_frequency must be larger or equal to 1. (Given: {})".format(self.sampling_frequency) raise GenericTrainerException(err_msg) - self.s3_client = s3_client self.s3_prefix = s3_prefix self._cur_sample_count = 0 self._cur_frequency = 0 + self._bucket = bucket + self._s3_client = S3Client(region_name, + s3_endpoint_url, + max_retry_attempts, + backoff_time_sec) - """ - Save given data as pickle and upload to s3. - - collector will stop persisting if the number of samples reached max_sample_count. - - collector will only persist if sampling_frequency is met. - - Args: - data (object): The sample data to pickle and upload to S3 - """ def sample(self, data): + """Save given data as pickle and upload to s3. + - collector will stop persisting if the number of samples reached max_sample_count. + - collector will only persist if sampling_frequency is met. 
+ + Args: + data (object): The sample data to pickle and upload to S3 + """ if self._cur_sample_count >= self.max_sample_count: return self._cur_frequency += 1 @@ -42,10 +63,12 @@ def sample(self, data): raise GenericTrainerException('Failed to dump the sample data: {}'.format(ex)) try: - self.s3_client.upload_file(os.path.normpath("%s/samples/%s" % (self.s3_prefix, pickle_filename)), - pickle_filename) + self._s3_client.upload_file(bucket=self._bucket, + s3_key=os.path.normpath("%s/samples/%s" % (self.s3_prefix, + pickle_filename)), + local_path=pickle_filename, + s3_kms_extra_args=dict()) except Exception as ex: raise GenericTrainerException('Failed to upload the sample pickle file to S3: {}'.format(ex)) self._cur_frequency = 0 self._cur_sample_count += 1 - diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/tournament_worker.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/tournament_worker.py index 0b0593eb..5d9c51b1 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/tournament_worker.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/tournament_worker.py @@ -26,13 +26,11 @@ from markov.metrics.iteration_data import IterationData from markov.metrics.constants import MetricsS3Keys from markov.s3_boto_data_store import S3BotoDataStore, S3BotoDataStoreParameters -from markov.s3_client import SageS3Client from markov.sagemaker_graph_manager import get_graph_manager from markov.rollout_utils import (PhaseObserver, signal_robomaker_markov_package_ready, configure_environment_randomizer, get_robomaker_profiler_env) from markov.rospy_wrappers import ServiceProxyWrapper from markov.camera_utils import configure_camera -from markov.checkpoint_utils import TEMP_RENAME_FOLDER, wait_for_checkpoints, modify_checkpoint_variables from markov.track_geom.track_data import TrackData from markov.track_geom.utils import get_start_positions from markov.reset.constants import AgentInfo @@ -45,6 +43,7 @@ 
SimtraceVideoNames) from markov.s3.files.model_metadata import ModelMetadata from markov.s3.files.simtrace_video import SimtraceVideo +from markov.s3.files.checkpoint import Checkpoint from markov.s3.utils import get_s3_key from std_srvs.srv import Empty, EmptyRequest @@ -53,9 +52,6 @@ MIN_RESET_COUNT = 10000 #TODO: change when console passes float("inf") -if not os.path.exists(TEMP_RENAME_FOLDER): - os.makedirs(TEMP_RENAME_FOLDER) - IS_PROFILER_ON, PROFILER_S3_BUCKET, PROFILER_S3_PREFIX = get_robomaker_profiler_env() def tournament_worker(graph_manager, number_of_trials, task_parameters, simtrace_video_s3_writers, is_continuous, @@ -79,16 +75,15 @@ def tournament_worker(graph_manager, number_of_trials, task_parameters, simtrace subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic = list(), list() subscribe_to_save_mp4, unsubscribe_from_save_mp4 = list(), list() for agent_param in graph_manager.agents_params: - _checkpoint_dir = task_parameters.checkpoint_restore_path if len(graph_manager.agents_params) == 1 \ - else os.path.join(task_parameters.checkpoint_restore_path, agent_param.name) + _checkpoint_dir = os.path.join(task_parameters.checkpoint_restore_path, agent_param.name) agent_names.append(agent_param.name) checkpoint_dirs.append(_checkpoint_dir) racecar_name = 'racecar' if len(agent_param.name.split("_")) == 1 \ else "racecar_{}".format(agent_param.name.split("_")[1]) subscribe_to_save_mp4_topic.append("/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name)) unsubscribe_from_save_mp4_topic.append("/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name)) - wait_for_checkpoints(checkpoint_dirs, graph_manager.data_store) - modify_checkpoint_variables(checkpoint_dirs, agent_names) + graph_manager.data_store.wait_for_checkpoints() + graph_manager.data_store.modify_checkpoint_variables() # Make the clients that will allow us to pause and unpause the physics rospy.wait_for_service('/gazebo/pause_physics_dr') @@ -358,6 +353,7 @@ def main(): 
agent_list = list() s3_bucket_dict = dict() s3_prefix_dict = dict() + checkpoint_dict = dict() start_positions = get_start_positions(len(arg_s3_bucket)) done_condition = utils.str_to_done_condition(rospy.get_param("DONE_CONDITION", any)) park_positions = utils.pos_2d_str_to_list(rospy.get_param("PARK_POSITIONS", [])) @@ -387,11 +383,6 @@ def main(): mp4_s3_bucket_dict[agent_name] = mp4_s3_bucket[agent_index] mp4_s3_object_prefix_dict[agent_name] = mp4_s3_object_prefix[agent_index] - s3_client = SageS3Client(bucket=arg_s3_bucket[agent_index], - s3_prefix=arg_s3_prefix[agent_index], - s3_endpoint_url=args.s3_endpoint_url, - aws_region=args.aws_region) - # download model metadata model_metadata = ModelMetadata(bucket=arg_s3_bucket[agent_index], s3_key=get_s3_key(arg_s3_prefix[agent_index], MODEL_METADATA_S3_POSTFIX), @@ -400,17 +391,24 @@ def main(): local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format(agent_name)) _, _, version = model_metadata.get_model_metadata_info() - if version < SIMAPP_VERSION_2 and \ - not utils.has_current_ckpnt_name(arg_s3_bucket[agent_index], arg_s3_prefix[agent_index], args.aws_region): - utils.make_compatible(arg_s3_bucket[agent_index], arg_s3_prefix[agent_index], args.aws_region, - SyncFiles.TRAINER_READY.value, - s3_endpoint_url=args.s3_endpoint_url) - - # Select the optimal model - utils.do_model_selection(s3_bucket=arg_s3_bucket[agent_index], + # checkpoint s3 instance + checkpoint = Checkpoint(bucket=arg_s3_bucket[agent_index], s3_prefix=arg_s3_prefix[agent_index], - region=args.aws_region, - s3_endpoint_url=args.s3_endpoint_url) + region_name=args.aws_region, + s3_endpoint_url=args.s3_endpoint_url, + agent_name=agent_name, + checkpoint_dir=args.local_model_directory) + # make coach checkpoint compatible + if version < SIMAPP_VERSION_2 and not checkpoint.rl_coach_checkpoint.is_compatible(): + checkpoint.rl_coach_checkpoint.make_compatible(checkpoint.syncfile_ready) + # get best model checkpoint string + model_checkpoint_name = 
checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint() + # Select the best checkpoint model by uploading rl coach .coach_checkpoint file + checkpoint.rl_coach_checkpoint.update( + model_checkpoint_name=model_checkpoint_name, + s3_kms_extra_args=utils.get_s3_kms_extra_args()) + + checkpoint_dict[agent_name] = checkpoint agent_config = { 'model_metadata': model_metadata, @@ -494,13 +492,10 @@ def main(): enable_domain_randomization=enable_domain_randomization, done_condition=done_condition) - ds_params_instance = S3BotoDataStoreParameters(aws_region=args.aws_region, - bucket_names=s3_bucket_dict, - base_checkpoint_dir=args.local_model_directory, - s3_folders=s3_prefix_dict, - s3_endpoint_url=args.s3_endpoint_url) + ds_params_instance = S3BotoDataStoreParameters(checkpoint_dict=checkpoint_dict) - graph_manager.data_store = S3BotoDataStore(params=ds_params_instance, graph_manager=graph_manager, + graph_manager.data_store = S3BotoDataStore(params=ds_params_instance, + graph_manager=graph_manager, ignore_lock=True) graph_manager.env_params.seed = 0 diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/training_worker.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/training_worker.py index 01f3b826..cfde7efb 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/training_worker.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/training_worker.py @@ -3,6 +3,7 @@ import argparse import json import logging +import botocore import math import warnings @@ -25,17 +26,18 @@ from markov.agent_ctrl.constants import ConfigParams from markov.agents.training_agent_factory import create_training_agent from markov.s3_boto_data_store import S3BotoDataStore, S3BotoDataStoreParameters -from markov.s3_client import SageS3Client from markov.sagemaker_graph_manager import get_graph_manager from markov.samples.sample_collector import SampleCollector from markov.deepracer_memory import 
DeepRacerRedisPubSubMemoryBackendParameters from markov.s3.files.hyperparameters import Hyperparameters from markov.s3.files.model_metadata import ModelMetadata from markov.s3.files.ip_config import IpConfig +from markov.s3.files.checkpoint import Checkpoint from markov.s3.utils import get_s3_key from markov.s3.constants import (MODEL_METADATA_LOCAL_PATH_FORMAT, MODEL_METADATA_S3_POSTFIX, HYPERPARAMETER_S3_POSTFIX) +from markov.s3.s3_client import S3Client import tensorflow as tf tf.logging.set_verbosity(tf.logging.ERROR) @@ -43,11 +45,7 @@ logger = Logger(__name__, logging.INFO).get_logger() PRETRAINED_MODEL_DIR = "./pretrained_checkpoint" -SM_MODEL_OUTPUT_DIR = os.environ.get("ALGO_MODEL_DIR", "/opt/ml/model") -CUSTOM_FILES_PATH = "./custom_files" - -if not os.path.exists(CUSTOM_FILES_PATH): - os.makedirs(CUSTOM_FILES_PATH) +SM_MODEL_OUTPUT_DIR = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") IS_PROFILER_ON, PROFILER_S3_BUCKET, PROFILER_S3_PREFIX = utils.get_sagemaker_profiler_env() @@ -138,18 +136,15 @@ def training_worker(graph_manager, task_parameters, user_batch_size, except ValueError as err: if utils.is_user_error(err): - log_and_exit("User modified model: {}" - .format(err), + log_and_exit("User modified model: {}".format(err), SIMAPP_TRAINING_WORKER_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500) else: - log_and_exit("An error occured while training: {}" - .format(err), + log_and_exit("An error occured while training: {}".format(err), SIMAPP_TRAINING_WORKER_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500) except Exception as ex: - log_and_exit("An error occured while training: {}" - .format(ex), + log_and_exit("An error occured while training: {}".format(ex), SIMAPP_TRAINING_WORKER_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_500) finally: @@ -172,11 +167,11 @@ def main(): help="(string) Model Metadata File S3 Key", type=str, required=False) - parser.add_argument('-c', '--checkpoint-dir', + parser.add_argument('-c', '--checkpoint_dir', help='(string) Path to a folder containing a 
checkpoint to write the model to.', type=str, default='./checkpoint') - parser.add_argument('--pretrained-checkpoint-dir', + parser.add_argument('--pretrained_checkpoint_dir', help='(string) Path to a folder for downloading a pre-trained model', type=str, default=PRETRAINED_MODEL_DIR) @@ -211,7 +206,7 @@ def main(): args, _ = parser.parse_known_args() logger.info("S3 bucket: %s \n S3 prefix: %s \n S3 endpoint URL: %s", args.s3_bucket, args.s3_prefix, args.s3_endpoint_url) - s3_client = SageS3Client(bucket=args.s3_bucket, s3_prefix=args.s3_prefix, aws_region=args.aws_region, s3_endpoint_url=args.s3_endpoint_url) + s3_client = S3Client(region_name=args.aws_region, s3_endpoint_url=args.s3_endpoint_url, max_retry_attempts=0) # download model metadata # TODO: replace 'agent' with name of each agent @@ -235,14 +230,22 @@ def main(): success_custom_preset = False if args.preset_s3_key: preset_local_path = "./markov/presets/preset.py" - success_custom_preset = s3_client.download_file(s3_key=args.preset_s3_key, local_path=preset_local_path) + try: + s3_client.download_file(bucket=args.s3_bucket, + s3_key=args.preset_s3_key, + local_path=preset_local_path) + success_custom_preset = True + except botocore.exceptions.ClientError: + pass if not success_custom_preset: logger.info("Could not download the preset file. Using the default DeepRacer preset.") else: preset_location = "markov.presets.preset:graph_manager" graph_manager = short_dynamic_import(preset_location, ignore_module_case=True) - success_custom_preset = s3_client.upload_file( - s3_key=os.path.normpath("%s/presets/preset.py" % args.s3_prefix), local_path=preset_local_path) + s3_client.upload_file(bucket=args.s3_bucket, + s3_key=os.path.normpath("%s/presets/preset.py" % args.s3_prefix), + local_path=preset_local_path, + s3_kms_extra_args=utils.get_s3_kms_extra_args()) if success_custom_preset: logger.info("Using preset: %s" % args.preset_s3_key) @@ -257,13 +260,13 @@ def main(): #! 
TODO each agent should have own config agent_config = {'model_metadata': model_metadata_download, ConfigParams.CAR_CTRL_CONFIG.value: {ConfigParams.LINK_NAME_LIST.value: [], - ConfigParams.VELOCITY_LIST.value : {}, - ConfigParams.STEERING_LIST.value : {}, - ConfigParams.CHANGE_START.value : None, - ConfigParams.ALT_DIR.value : None, - ConfigParams.ACTION_SPACE_PATH.value : model_metadata_download.local_path, - ConfigParams.REWARD.value : None, - ConfigParams.AGENT_NAME.value : 'racecar'}} + ConfigParams.VELOCITY_LIST.value: {}, + ConfigParams.STEERING_LIST.value: {}, + ConfigParams.CHANGE_START.value: None, + ConfigParams.ALT_DIR.value: None, + ConfigParams.ACTION_SPACE_PATH.value: model_metadata_download.local_path, + ConfigParams.REWARD.value: None, + ConfigParams.AGENT_NAME.value: 'racecar'}} agent_list = list() agent_list.append(create_training_agent(agent_config)) @@ -283,7 +286,10 @@ def main(): # Attach sample collector to graph_manager only if sample count > 0 max_sample_count = int(sm_hyperparams_dict.get("max_sample_count", 0)) if max_sample_count > 0: - sample_collector = SampleCollector(s3_client=s3_client, s3_prefix=args.s3_prefix, + sample_collector = SampleCollector(bucket=args.s3_bucket, + s3_prefix=args.s3_prefix, + region_name=args.aws_region, + s3_endpoint_url=args.s3_endpoint_url, max_sample_count=max_sample_count, sampling_frequency=int(sm_hyperparams_dict.get("sampling_frequency", 1))) graph_manager.sample_collector = sample_collector @@ -298,21 +304,27 @@ def main(): use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix # Handle backward compatibility if use_pretrained_model: - if version < SIMAPP_VERSION_2 and \ - not utils.has_current_ckpnt_name(args.pretrained_s3_bucket, args.pretrained_s3_prefix, args.aws_region, args.s3_endpoint_url): - utils.make_compatible(args.pretrained_s3_bucket, args.pretrained_s3_prefix, - args.aws_region, SyncFiles.TRAINER_READY.value) - #Select the optimal model for the starting weights 
- utils.do_model_selection(s3_bucket=args.s3_bucket, - s3_prefix=args.s3_prefix, - region=args.aws_region, - s3_endpoint_url=args.s3_endpoint_url) - - ds_params_instance_pretrained = S3BotoDataStoreParameters(aws_region=args.aws_region, - bucket_names={'agent':args.pretrained_s3_bucket}, - base_checkpoint_dir=args.pretrained_checkpoint_dir, - s3_folders={'agent':args.pretrained_s3_prefix}, - s3_endpoint_url=args.s3_endpoint_url) + # checkpoint s3 instance for pretrained model + # TODO: replace 'agent' for multiagent training + checkpoint = Checkpoint(bucket=args.pretrained_s3_bucket, + s3_prefix=args.pretrained_s3_prefix, + region_name=args.aws_region, + s3_endpoint_url=args.s3_endpoint_url, + agent_name='agent', + checkpoint_dir=args.pretrained_checkpoint_dir) + # make coach checkpoint compatible + if version < SIMAPP_VERSION_2 and not checkpoint.rl_coach_checkpoint.is_compatible(): + checkpoint.rl_coach_checkpoint.make_compatible(checkpoint.syncfile_ready) + # get best model checkpoint string + model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint() + # Select the best checkpoint model by uploading rl coach .coach_checkpoint file + checkpoint.rl_coach_checkpoint.update( + model_checkpoint_name=model_checkpoint_name, + s3_kms_extra_args=utils.get_s3_kms_extra_args()) + # add checkpoint into checkpoint_dict + checkpoint_dict = {'agent': checkpoint} + # load pretrained model + ds_params_instance_pretrained = S3BotoDataStoreParameters(checkpoint_dict=checkpoint_dict) data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained, graph_manager, True) data_store_pretrained.load_from_store() @@ -324,11 +336,15 @@ def main(): graph_manager.memory_backend_params = memory_backend_params - ds_params_instance = S3BotoDataStoreParameters(aws_region=args.aws_region, - bucket_names={'agent':args.s3_bucket}, - base_checkpoint_dir=args.checkpoint_dir, - s3_folders={'agent':args.s3_prefix}, - s3_endpoint_url=args.s3_endpoint_url) + # 
checkpoint s3 instance for training model + checkpoint = Checkpoint(bucket=args.s3_bucket, + s3_prefix=args.s3_prefix, + region_name=args.aws_region, + s3_endpoint_url=args.s3_endpoint_url, + agent_name='agent', + checkpoint_dir=args.checkpoint_dir) + checkpoint_dict = {'agent': checkpoint} + ds_params_instance = S3BotoDataStoreParameters(checkpoint_dict=checkpoint_dict) graph_manager.data_store_params = ds_params_instance diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/utils.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/utils.py index b97acf19..613e6dd6 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/utils.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/utils.py @@ -19,7 +19,6 @@ from markov.log_handler.exception_handler import log_and_exit, simapp_exit_gracefully from markov.log_handler.deepracer_exceptions import GenericException from markov.constants import (ROBOMAKER_CANCEL_JOB_WAIT_TIME, - CHKPNT_KEY_SUFFIX, DEEPRACER_CHKPNT_KEY_SUFFIX, NUM_RETRIES, CONNECT_TIMEOUT, BEST_CHECKPOINT, LAST_CHECKPOINT, SAGEMAKER_S3_KMS_CMK_ARN, ROBOMAKER_S3_KMS_CMK_ARN, S3_KMS_CMK_ARN_ENV, HYPERPARAMETERS, SAGEMAKER_IS_PROFILER_ON, @@ -31,6 +30,23 @@ logger = Logger(__name__, logging.INFO).get_logger() + +def is_int_repr(val): + '''check whether input is int or not + + Args: + val (str): str input + + Returns: + bool: True if string can be convert to int, False otherwise + ''' + try: + int(val) + return True + except ValueError: + return False + + def force_list(val): if type(val) is not list: val = [val] @@ -102,329 +118,6 @@ def get_ip_from_host(timeout=100): SIMAPP_EVENT_ERROR_CODE_500) return ip_address -def load_model_metadata(s3_client, model_metadata_s3_key, model_metadata_local_path): - """Loads the model metadata. 
- """ - - # Try to download the custom model metadata from s3 first - download_success = False - if not model_metadata_s3_key: - logger.info("Custom model metadata key not provided, using defaults.") - else: - # Strip the s3:// prefix if it exists - model_metadata_s3_key = model_metadata_s3_key.replace('s3://{}/'.format(s3_client.bucket), '') - download_success = s3_client.download_file(s3_key=model_metadata_s3_key, - local_path=model_metadata_local_path) - if download_success: - logger.info("Successfully downloaded model metadata from {}.".format(model_metadata_s3_key)) - else: - logger.info("Could not download custom model metadata from {}, using defaults.".format(model_metadata_s3_key)) - - # If the download was successful, validate the contents - if download_success: - try: - with open(model_metadata_local_path, 'r') as f: - model_metadata = json.load(f) - if 'action_space' not in model_metadata: - logger.info("Custom model metadata does not define an action space.") - download_success = False - except Exception: - logger.info("Could not download custom model metadata, using defaults.") - - # If the download was unsuccessful, load the default model metadata instead - if not download_success: - from markov.defaults import model_metadata - with open(model_metadata_local_path, 'w') as f: - json.dump(model_metadata, f, indent=4) - logger.info("Loaded default action space.") - - - - -def get_best_checkpoint_num(s3_bucket, s3_prefix, region, s3_endpoint_url=None): - """Get the checkpoint number of the best checkpoint if its available, else return last checkpoint - Args: - s3_bucket (str): S3 bucket where the deepracer_checkpoints.json is stored - s3_prefix (str): S3 prefix where the deepracer_checkpoints.json is stored - region (str): AWS region where the deepracer_checkpoints.json is stored - Returns: - int: Best checkpoint or last checkpoint number if found else return -1 - """ - checkpoint_num = -1 - best_checkpoint_name = get_best_checkpoint(s3_bucket, s3_prefix, 
region, s3_endpoint_url) - if best_checkpoint_name and len(best_checkpoint_name.split("_Step")) > 0: - checkpoint_num = int(best_checkpoint_name.split("_Step")[0]) - else: - logger.info("Unable to find the best checkpoint number. Getting the last checkpoint number") - checkpoint_num = get_last_checkpoint_num(s3_bucket, s3_prefix, region, s3_endpoint_url) - return checkpoint_num - - -def get_last_checkpoint_num(s3_bucket, s3_prefix, region, s3_endpoint_url=None): - """Get the checkpoint number of the last checkpoint. - Args: - s3_bucket (str): S3 bucket where the deepracer_checkpoints.json is stored - s3_prefix (str): S3 prefix where the deepracer_checkpoints.json is stored - region (str): AWS region where the deepracer_checkpoints.json is stored - Returns: - int: Last checkpoint number if found else return -1 - """ - checkpoint_num = -1 - # Get the last checkpoint name from the deepracer_checkpoints.json file - last_checkpoint_name = get_last_checkpoint(s3_bucket, s3_prefix, region, s3_endpoint_url) - # Verify if the last checkpoint name is present and is in right format - if last_checkpoint_name and len(last_checkpoint_name.split("_Step")) > 0: - checkpoint_num = int(last_checkpoint_name.split("_Step")[0]) - else: - logger.info("Unable to find the last checkpoint number.") - return checkpoint_num - - -def copy_best_frozen_model_to_sm_output_dir(s3_bucket, s3_prefix, region, - source_dir, dest_dir, s3_endpoint_url=None): - """Copy the frozen model for the current best checkpoint from soure directory to the destination directory. 
- Args: - s3_bucket (str): S3 bucket where the deepracer_checkpoints.json is stored - s3_prefix (str): S3 prefix where the deepracer_checkpoints.json is stored - region (str): AWS region where the deepracer_checkpoints.json is stored - source_dir (str): Source directory where the frozen models are present - dest_dir (str): Sagemaker output directory where we store the frozen models for best checkpoint - """ - dest_dir_pb_files = [filename for filename in os.listdir(dest_dir) - if os.path.isfile(os.path.join(dest_dir, filename)) and filename.endswith(".pb")] - source_dir_pb_files = [filename for filename in os.listdir(source_dir) - if os.path.isfile(os.path.join(source_dir, filename)) and filename.endswith(".pb")] - best_checkpoint_num_s3 = get_best_checkpoint_num(s3_bucket, - s3_prefix, - region, - s3_endpoint_url) - last_checkpoint_num_s3 = get_last_checkpoint_num(s3_bucket, - s3_prefix, - region, - s3_endpoint_url) - logger.info("Best checkpoint number: {}, Last checkpoint number: {}" - .format(best_checkpoint_num_s3, last_checkpoint_num_s3)) - best_model_name = 'model_{}.pb'.format(best_checkpoint_num_s3) - last_model_name = 'model_{}.pb'.format(last_checkpoint_num_s3) - if len(source_dir_pb_files) < 1: - log_and_exit("Could not find any frozen model file in the local directory", - SIMAPP_S3_DATA_STORE_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_500) - try: - # Could not find the deepracer_checkpoints.json file or there are no model.pb files in destination - if best_checkpoint_num_s3 == -1 or len(dest_dir_pb_files) == 0: - if len(source_dir_pb_files) > 1: - logger.info("More than one model.pb found in the source directory. 
Choosing the " - "first one to copy to destination: {}".format(source_dir_pb_files[0])) - # copy the frozen model present in the source directory - logger.info("Copying the frozen checkpoint from {} to {}.".format( - os.path.join(source_dir, source_dir_pb_files[0]), os.path.join(dest_dir, "model.pb"))) - shutil.copy(os.path.join(source_dir, source_dir_pb_files[0]), os.path.join(dest_dir, "model.pb")) - else: - # Delete the current .pb files in the destination direcory - for filename in dest_dir_pb_files: - os.remove(os.path.join(dest_dir, filename)) - - # Copy the frozen model for the current best checkpoint to the destination directory - logger.info("Copying the frozen checkpoint from {} to {}.".format( - os.path.join(source_dir, best_model_name), os.path.join(dest_dir, "model.pb"))) - shutil.copy(os.path.join(source_dir, best_model_name), os.path.join(dest_dir, "model.pb")) - - # Loop through the current list of frozen models in source directory and - # delete the iterations lower than last_checkpoint_iteration except best_model - for filename in source_dir_pb_files: - if filename not in [best_model_name, last_model_name]: - if len(filename.split("_")[1]) > 1 and len(filename.split("_")[1].split(".pb")): - file_iteration = int(filename.split("_")[1].split(".pb")[0]) - if file_iteration < last_checkpoint_num_s3: - os.remove(os.path.join(source_dir, filename)) - else: - logger.error("Frozen model name not in the right format in the source directory: {}, {}" - .format(filename, source_dir)) - except FileNotFoundError as err: - log_and_exit("No such file or directory: {}".format(err), - SIMAPP_S3_DATA_STORE_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - - -def get_best_checkpoint(s3_bucket, s3_prefix, region, s3_endpoint_url=None): - return get_deepracer_checkpoint(s3_bucket=s3_bucket, - s3_prefix=s3_prefix, - region=region, - checkpoint_type=BEST_CHECKPOINT, - s3_endpoint_url=s3_endpoint_url) - - -def get_last_checkpoint(s3_bucket, s3_prefix, region, 
s3_endpoint_url=None): - return get_deepracer_checkpoint(s3_bucket=s3_bucket, - s3_prefix=s3_prefix, - region=region, - checkpoint_type=LAST_CHECKPOINT, - s3_endpoint_url=s3_endpoint_url) - - -def get_deepracer_checkpoint(s3_bucket, s3_prefix, region, checkpoint_type, s3_endpoint_url=None): - '''Returns the best checkpoint stored in the best checkpoint json - s3_bucket - DeepRacer s3 bucket - s3_prefix - Prefix for the training job for which to select the best model for - region - Name of the aws region where the job ran - checkpoint_type - BEST_CHECKPOINT/LAST_CHECKPOINT - ''' - try: - session = boto3.Session() - s3_client = session.client('s3', region_name=region, endpoint_url=s3_endpoint_url, config=get_boto_config()) - # Download the best model if available - deepracer_checkpoint_json = os.path.join(os.getcwd(), 'deepracer_checkpoints.json') - s3_client.download_file(Bucket=s3_bucket, - Key=os.path.join(s3_prefix, DEEPRACER_CHKPNT_KEY_SUFFIX), - Filename=deepracer_checkpoint_json) - except botocore.exceptions.ClientError as err: - if err.response['Error']['Code'] == "404": - logger.info("Unable to find best model data, using last model") - return None - else: - log_and_exit("Unable to download best checkpoint: {}, {}".\ - format(s3_bucket, err.response['Error']['Code']), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - except Exception as ex: - log_and_exit("Can't download best checkpoint: {}" - .format(ex), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_500) - try: - with open(deepracer_checkpoint_json) as deepracer_checkpoint_file: - checkpoint = json.load(deepracer_checkpoint_file)[checkpoint_type]["name"] - if not checkpoint: - raise Exception("No checkpoint recorded") - os.remove(deepracer_checkpoint_json) - except Exception as ex: - logger.info("Unable to parse best checkpoint data: {}, using last \ - checkpoint instead".format(ex)) - return None - return checkpoint - - -def do_model_selection(s3_bucket, s3_prefix, 
region, checkpoint_type=BEST_CHECKPOINT, s3_endpoint_url=None): - '''Sets the chekpoint file to point at the best model based on reward and progress - s3_bucket - DeepRacer s3 bucket - s3_prefix - Prefix for the training job for which to select the best model for - region - Name of the aws region where the job ran - - :returns status of model selection. True if successfully selected model otherwise false. - ''' - try: - s3_extra_args = get_s3_kms_extra_args() - model_checkpoint = get_deepracer_checkpoint(s3_bucket=s3_bucket, - s3_prefix=s3_prefix, - region=region, - checkpoint_type=checkpoint_type, - s3_endpoint_url=s3_endpoint_url) - # check model checkpoint is present and is type string - if model_checkpoint is None or not isinstance(model_checkpoint, str): - logger.info("Exit model selection because model_checkpoint is {} of type {}".\ - format(model_checkpoint, type(model_checkpoint))) - return False - local_path = os.path.abspath(os.path.join(os.getcwd(), 'coach_checkpoint')) - with open(local_path, '+w') as new_ckpnt: - new_ckpnt.write(model_checkpoint) - s3_client = boto3.Session().client('s3', region_name=region, endpoint_url=s3_endpoint_url, config=get_boto_config()) - s3_client.upload_file(Filename=local_path, - Bucket=s3_bucket, - Key=os.path.join(s3_prefix, CHKPNT_KEY_SUFFIX), - ExtraArgs=s3_extra_args) - os.remove(local_path) - return True - except botocore.exceptions.ClientError as err: - log_and_exit("Unable to upload checkpoint: {}, {}" - .format(s3_bucket, err.response['Error']['Code']), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - except Exception as ex: - log_and_exit("Exception in uploading checkpoint: {}" - .format(ex), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_500) - -def has_current_ckpnt_name(s3_bucket, s3_prefix, region, s3_endpoint_url=None): - '''This method checks if a given s3 bucket contains the current checkpoint key - s3_bucket - DeepRacer s3 bucket - s3_prefix - Prefix for the training 
job for which to select the best model for - region - Name of the aws region where the job ran - ''' - try: - session = boto3.Session() - s3_client = session.client('s3', region_name=region, endpoint_url=s3_endpoint_url, config=get_boto_config()) - response = s3_client.list_objects_v2(Bucket=s3_bucket, - Prefix=os.path.join(s3_prefix, "model")) - if 'Contents' not in response: - # Customer deleted checkpoint file. - log_and_exit("No objects found: {}" - .format(s3_bucket), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - - _, ckpnt_name = os.path.split(CHKPNT_KEY_SUFFIX) - return any(list(map(lambda obj: os.path.split(obj['Key'])[1] == ckpnt_name, - response['Contents']))) - except botocore.exceptions.ClientError as e: - log_and_exit("No objects found: {}, {}" - .format(s3_bucket, e.response['Error']['Code']), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - except Exception as e: - log_and_exit("Exception in checking for current checkpoint key: {}" - .format(e), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_500) - -def make_compatible(s3_bucket, s3_prefix, region, ready_file, s3_endpoint_url=None): - '''Moves and creates all the necessary files to make models trained by coach 0.11 - compatible with coach 1.0 - s3_bucket - DeepRacer s3 bucket - s3_prefix - Prefix for the training job for which to select the best model for - region - Name of the aws region where the job ran - ''' - try: - session = boto3.Session() - s3_client = session.client('s3', region_name=region, endpoint_url=s3_endpoint_url, config=get_boto_config()) - - s3_extra_args = get_s3_kms_extra_args() - old_checkpoint = os.path.join(os.getcwd(), 'checkpoint') - s3_client.download_file(Bucket=s3_bucket, - Key=os.path.join(s3_prefix, 'model/checkpoint'), - Filename=old_checkpoint) - - with open(old_checkpoint) as old_checkpoint_file: - chekpoint = re.findall(r'"(.*?)"', old_checkpoint_file.readline()) - if len(chekpoint) != 1: - 
log_and_exit("No checkpoint file found", - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - os.remove(old_checkpoint) - # Upload ready file so that the system can gab the checkpoints - s3_client.upload_fileobj(Fileobj=io.BytesIO(b''), - Bucket=s3_bucket, - Key=os.path.join(s3_prefix, "model/{}").format(ready_file), - ExtraArgs=s3_extra_args) - # Upload the new checkpoint file - new_checkpoint = os.path.join(os.getcwd(), 'coach_checkpoint') - with open(new_checkpoint, 'w+') as new_checkpoint_file: - new_checkpoint_file.write(chekpoint[0]) - s3_client.upload_file(Filename=new_checkpoint, Bucket=s3_bucket, - Key=os.path.join(s3_prefix, CHKPNT_KEY_SUFFIX), - ExtraArgs=s3_extra_args) - os.remove(new_checkpoint) - except botocore.exceptions.ClientError as e: - log_and_exit("Unable to make model compatible: {}, {}" - .format(s3_bucket, e.response['Error']['Code']), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_400) - except Exception as e: - log_and_exit("Exception in making model compatible: {}" - .format(e), - SIMAPP_SIMULATION_WORKER_EXCEPTION, - SIMAPP_EVENT_ERROR_CODE_500) def is_user_error(error): ''' Helper method that determines whether a value error is caused by an invalid checkpoint @@ -488,7 +181,7 @@ def get_racecar_idx(racecar_name): SIMAPP_EVENT_ERROR_CODE_500) def get_s3_kms_extra_args(): - """ Since the SageS3Client class is called by both robomaker and sagemaker. One has to know + """ Since the S3Client class is called by both robomaker and sagemaker. One has to know first if its coming from sagemaker or robomaker. Then alone I could decide to fetch the kms arn to encrypt all the S3 upload object. 
Return the extra args that is required to encrypt the s3 object with KMS key If the KMS key not passed then returns empty dict diff --git a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/validation_worker.py b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/validation_worker.py index 43f5e6a6..052f9acd 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/validation_worker.py +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/markov/validation_worker.py @@ -17,7 +17,6 @@ from rl_coach.base_parameters import TaskParameters from rl_coach.logger import screen from rl_coach.data_stores.data_store import SyncFiles -from rl_coach.rollout_worker import wait_for_checkpoint from rl_coach.core_types import EnvironmentSteps, RunPhase from markov import utils @@ -34,6 +33,7 @@ from markov.sagemaker_graph_manager import get_graph_manager from markov.architecture.constants import Input from markov.s3.files.model_metadata import ModelMetadata +from markov.s3.files.checkpoint import Checkpoint from markov.s3.utils import get_s3_key from markov.s3.constants import MODEL_METADATA_S3_POSTFIX @@ -46,29 +46,30 @@ def _validate(graph_manager, task_parameters, transitions, s3_bucket, s3_prefix, aws_region): + checkpoint = graph_manager.data_store.params.checkpoint_dict['agent'] checkpoint_dir = task_parameters.checkpoint_restore_path - wait_for_checkpoint(checkpoint_dir, graph_manager.data_store) - - if utils.do_model_selection(s3_bucket=s3_bucket, - s3_prefix=s3_prefix, - region=aws_region, - checkpoint_type=LAST_CHECKPOINT): - screen.log_title(" Validating Last Checkpoint: {}".format(utils.get_last_checkpoint(s3_bucket, - s3_prefix, - aws_region))) + graph_manager.data_store.wait_for_checkpoints() + + # validate last checkpoint + last_model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_last_checkpoint() + if checkpoint.rl_coach_checkpoint.update( + model_checkpoint_name=last_model_checkpoint_name, + 
s3_kms_extra_args=utils.get_s3_kms_extra_args()): + screen.log_title(" Validating Last Checkpoint: {}".format(last_model_checkpoint_name)) + # load the last rl coach checkpoint from store + graph_manager.data_store.load_from_store() graph_manager.create_graph(task_parameters) graph_manager.phase = RunPhase.TEST screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint") graph_manager.emulate_act_on_trainer(EnvironmentSteps(1), transitions=transitions) screen.log_title(" emulate_act_on_trainer on Last Checkpoint completed!") - # Best checkpoint might not exist. - if utils.do_model_selection(s3_bucket=s3_bucket, - s3_prefix=s3_prefix, - region=aws_region, - checkpoint_type=BEST_CHECKPOINT): - screen.log_title(" Validating Best Checkpoint: {}".format(utils.get_best_checkpoint(s3_bucket, - s3_prefix, - aws_region))) + # validate best checkpoint: Best checkpoint might not exist. + best_model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint() + if checkpoint.rl_coach_checkpoint.update( + model_checkpoint_name=best_model_checkpoint_name, + s3_kms_extra_args=utils.get_s3_kms_extra_args()): + screen.log_title(" Validating Best Checkpoint: {}".format(best_model_checkpoint_name)) + # load the best rl coach checkpoint from store graph_manager.data_store.load_from_store() graph_manager.restore_checkpoint() screen.log_title(" Start emulate_act_on_trainer on Best Checkpoint") @@ -79,6 +80,8 @@ def _validate(graph_manager, task_parameters, transitions, else: screen.log_title(" Validating Last Checkpoint") + # load the last rl coach checkpoint from store + graph_manager.data_store.load_from_store() graph_manager.create_graph(task_parameters) graph_manager.phase = RunPhase.TEST screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint ") @@ -141,19 +144,26 @@ def validate(s3_bucket, s3_prefix, aws_region): # Thus, call this get_transition_data function before create_traning_agent function! 
transitions = get_transition_data(observation_list) - if version < SIMAPP_VERSION_2 and \ - not utils.has_current_ckpnt_name(s3_bucket, s3_prefix, aws_region): - utils.make_compatible(s3_bucket, s3_prefix, aws_region, SyncFiles.TRAINER_READY.value) + checkpoint = Checkpoint(bucket=s3_bucket, + s3_prefix=s3_prefix, + region_name=args.aws_region, + agent_name='agent', + checkpoint_dir=LOCAL_MODEL_DIR) + # make coach checkpoint compatible + if version < SIMAPP_VERSION_2 and not checkpoint.rl_coach_checkpoint.is_compatible(): + checkpoint.rl_coach_checkpoint.make_compatible(checkpoint.syncfile_ready) + # add checkpoint into checkpoint_dict + checkpoint_dict = {'agent': checkpoint} agent_config = {'model_metadata': model_metadata, ConfigParams.CAR_CTRL_CONFIG.value: {ConfigParams.LINK_NAME_LIST.value: [], - ConfigParams.VELOCITY_LIST.value: {}, - ConfigParams.STEERING_LIST.value: {}, - ConfigParams.CHANGE_START.value: None, - ConfigParams.ALT_DIR.value: None, - ConfigParams.ACTION_SPACE_PATH.value: model_metadata.local_path, - ConfigParams.REWARD.value: None, - ConfigParams.AGENT_NAME.value: 'racecar'}} + ConfigParams.VELOCITY_LIST.value: {}, + ConfigParams.STEERING_LIST.value: {}, + ConfigParams.CHANGE_START.value: None, + ConfigParams.ALT_DIR.value: None, + ConfigParams.ACTION_SPACE_PATH.value: model_metadata.local_path, + ConfigParams.REWARD.value: None, + ConfigParams.AGENT_NAME.value: 'racecar'}} agent_list = list() agent_list.append(create_training_agent(agent_config)) @@ -163,10 +173,7 @@ def validate(s3_bucket, s3_prefix, aws_region): agent_list=agent_list, run_phase_subject=None) - ds_params_instance = S3BotoDataStoreParameters(aws_region=aws_region, - bucket_names={'agent': s3_bucket}, - s3_folders={'agent': s3_prefix}, - base_checkpoint_dir=LOCAL_MODEL_DIR) + ds_params_instance = S3BotoDataStoreParameters(checkpoint_dict=checkpoint_dict) graph_manager.data_store = S3BotoDataStore(ds_params_instance, graph_manager, ignore_lock=True) diff --git 
a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/sagemaker_rl_agent-0.0.1-py3.5.egg-info/SOURCES.txt b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/sagemaker_rl_agent-0.0.1-py3.5.egg-info/SOURCES.txt index 6c02285a..c8688079 100644 --- a/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/sagemaker_rl_agent-0.0.1-py3.5.egg-info/SOURCES.txt +++ b/bundle/sagemaker_rl_agent/lib/python3.5/site-packages/sagemaker_rl_agent-0.0.1-py3.5.egg-info/SOURCES.txt @@ -9,7 +9,6 @@ setup.py /opt/amazon/markov/agent_ctrl/utils.py /opt/amazon/markov/agent_ctrl/../__init__.py /opt/amazon/markov/agent_ctrl/../camera_utils.py -/opt/amazon/markov/agent_ctrl/../checkpoint_utils.py /opt/amazon/markov/agent_ctrl/../common.py /opt/amazon/markov/agent_ctrl/../constants.py /opt/amazon/markov/agent_ctrl/../deepracer_memory.py @@ -20,7 +19,6 @@ setup.py /opt/amazon/markov/agent_ctrl/../rollout_worker.py /opt/amazon/markov/agent_ctrl/../rospy_wrappers.py /opt/amazon/markov/agent_ctrl/../s3_boto_data_store.py -/opt/amazon/markov/agent_ctrl/../s3_client.py /opt/amazon/markov/agent_ctrl/../sagemaker_graph_manager.py /opt/amazon/markov/agent_ctrl/../tournament_worker.py /opt/amazon/markov/agent_ctrl/../training_worker.py @@ -33,7 +31,6 @@ setup.py /opt/amazon/markov/agents/utils.py /opt/amazon/markov/agents/../__init__.py /opt/amazon/markov/agents/../camera_utils.py -/opt/amazon/markov/agents/../checkpoint_utils.py /opt/amazon/markov/agents/../common.py /opt/amazon/markov/agents/../constants.py /opt/amazon/markov/agents/../deepracer_memory.py @@ -44,7 +41,6 @@ setup.py /opt/amazon/markov/agents/../rollout_worker.py /opt/amazon/markov/agents/../rospy_wrappers.py /opt/amazon/markov/agents/../s3_boto_data_store.py -/opt/amazon/markov/agents/../s3_client.py /opt/amazon/markov/agents/../sagemaker_graph_manager.py /opt/amazon/markov/agents/../tournament_worker.py /opt/amazon/markov/agents/../training_worker.py @@ -57,7 +53,6 @@ setup.py /opt/amazon/markov/architecture/embedder_factory.py 
/opt/amazon/markov/architecture/../__init__.py /opt/amazon/markov/architecture/../camera_utils.py -/opt/amazon/markov/architecture/../checkpoint_utils.py /opt/amazon/markov/architecture/../common.py /opt/amazon/markov/architecture/../constants.py /opt/amazon/markov/architecture/../deepracer_memory.py @@ -68,7 +63,6 @@ setup.py /opt/amazon/markov/architecture/../rollout_worker.py /opt/amazon/markov/architecture/../rospy_wrappers.py /opt/amazon/markov/architecture/../s3_boto_data_store.py -/opt/amazon/markov/architecture/../s3_client.py /opt/amazon/markov/architecture/../sagemaker_graph_manager.py /opt/amazon/markov/architecture/../tournament_worker.py /opt/amazon/markov/architecture/../training_worker.py @@ -84,7 +78,6 @@ setup.py /opt/amazon/markov/cameras/utils.py /opt/amazon/markov/cameras/../__init__.py /opt/amazon/markov/cameras/../camera_utils.py -/opt/amazon/markov/cameras/../checkpoint_utils.py /opt/amazon/markov/cameras/../common.py /opt/amazon/markov/cameras/../constants.py /opt/amazon/markov/cameras/../deepracer_memory.py @@ -95,7 +88,6 @@ setup.py /opt/amazon/markov/cameras/../rollout_worker.py /opt/amazon/markov/cameras/../rospy_wrappers.py /opt/amazon/markov/cameras/../s3_boto_data_store.py -/opt/amazon/markov/cameras/../s3_client.py /opt/amazon/markov/cameras/../sagemaker_graph_manager.py /opt/amazon/markov/cameras/../tournament_worker.py /opt/amazon/markov/cameras/../training_worker.py @@ -106,7 +98,6 @@ setup.py /opt/amazon/markov/cameras/handlers/top_camera.py /opt/amazon/markov/cameras/handlers/../../__init__.py /opt/amazon/markov/cameras/handlers/../../camera_utils.py -/opt/amazon/markov/cameras/handlers/../../checkpoint_utils.py /opt/amazon/markov/cameras/handlers/../../common.py /opt/amazon/markov/cameras/handlers/../../constants.py /opt/amazon/markov/cameras/handlers/../../deepracer_memory.py @@ -117,7 +108,6 @@ setup.py /opt/amazon/markov/cameras/handlers/../../rollout_worker.py /opt/amazon/markov/cameras/handlers/../../rospy_wrappers.py 
/opt/amazon/markov/cameras/handlers/../../s3_boto_data_store.py -/opt/amazon/markov/cameras/handlers/../../s3_client.py /opt/amazon/markov/cameras/handlers/../../sagemaker_graph_manager.py /opt/amazon/markov/cameras/handlers/../../tournament_worker.py /opt/amazon/markov/cameras/handlers/../../training_worker.py @@ -129,7 +119,6 @@ setup.py /opt/amazon/markov/domain_randomizations/randomizer_manager.py /opt/amazon/markov/domain_randomizations/../__init__.py /opt/amazon/markov/domain_randomizations/../camera_utils.py -/opt/amazon/markov/domain_randomizations/../checkpoint_utils.py /opt/amazon/markov/domain_randomizations/../common.py /opt/amazon/markov/domain_randomizations/../constants.py /opt/amazon/markov/domain_randomizations/../deepracer_memory.py @@ -140,7 +129,6 @@ setup.py /opt/amazon/markov/domain_randomizations/../rollout_worker.py /opt/amazon/markov/domain_randomizations/../rospy_wrappers.py /opt/amazon/markov/domain_randomizations/../s3_boto_data_store.py -/opt/amazon/markov/domain_randomizations/../s3_client.py /opt/amazon/markov/domain_randomizations/../sagemaker_graph_manager.py /opt/amazon/markov/domain_randomizations/../tournament_worker.py /opt/amazon/markov/domain_randomizations/../training_worker.py @@ -151,7 +139,6 @@ setup.py /opt/amazon/markov/domain_randomizations/visual/model_visual_randomizer.py /opt/amazon/markov/domain_randomizations/visual/../../__init__.py /opt/amazon/markov/domain_randomizations/visual/../../camera_utils.py -/opt/amazon/markov/domain_randomizations/visual/../../checkpoint_utils.py /opt/amazon/markov/domain_randomizations/visual/../../common.py /opt/amazon/markov/domain_randomizations/visual/../../constants.py /opt/amazon/markov/domain_randomizations/visual/../../deepracer_memory.py @@ -162,7 +149,6 @@ setup.py /opt/amazon/markov/domain_randomizations/visual/../../rollout_worker.py /opt/amazon/markov/domain_randomizations/visual/../../rospy_wrappers.py 
/opt/amazon/markov/domain_randomizations/visual/../../s3_boto_data_store.py -/opt/amazon/markov/domain_randomizations/visual/../../s3_client.py /opt/amazon/markov/domain_randomizations/visual/../../sagemaker_graph_manager.py /opt/amazon/markov/domain_randomizations/visual/../../tournament_worker.py /opt/amazon/markov/domain_randomizations/visual/../../training_worker.py @@ -173,7 +159,6 @@ setup.py /opt/amazon/markov/environments/deepracer_racetrack_env.py /opt/amazon/markov/environments/../__init__.py /opt/amazon/markov/environments/../camera_utils.py -/opt/amazon/markov/environments/../checkpoint_utils.py /opt/amazon/markov/environments/../common.py /opt/amazon/markov/environments/../constants.py /opt/amazon/markov/environments/../deepracer_memory.py @@ -184,7 +169,6 @@ setup.py /opt/amazon/markov/environments/../rollout_worker.py /opt/amazon/markov/environments/../rospy_wrappers.py /opt/amazon/markov/environments/../s3_boto_data_store.py -/opt/amazon/markov/environments/../s3_client.py /opt/amazon/markov/environments/../sagemaker_graph_manager.py /opt/amazon/markov/environments/../tournament_worker.py /opt/amazon/markov/environments/../training_worker.py @@ -193,7 +177,6 @@ setup.py /opt/amazon/markov/filters/__init__.py /opt/amazon/markov/filters/../__init__.py /opt/amazon/markov/filters/../camera_utils.py -/opt/amazon/markov/filters/../checkpoint_utils.py /opt/amazon/markov/filters/../common.py /opt/amazon/markov/filters/../constants.py /opt/amazon/markov/filters/../deepracer_memory.py @@ -204,7 +187,6 @@ setup.py /opt/amazon/markov/filters/../rollout_worker.py /opt/amazon/markov/filters/../rospy_wrappers.py /opt/amazon/markov/filters/../s3_boto_data_store.py -/opt/amazon/markov/filters/../s3_client.py /opt/amazon/markov/filters/../sagemaker_graph_manager.py /opt/amazon/markov/filters/../tournament_worker.py /opt/amazon/markov/filters/../training_worker.py @@ -214,7 +196,6 @@ setup.py /opt/amazon/markov/filters/observation/observation_binary_filter.py 
/opt/amazon/markov/filters/observation/../../__init__.py /opt/amazon/markov/filters/observation/../../camera_utils.py -/opt/amazon/markov/filters/observation/../../checkpoint_utils.py /opt/amazon/markov/filters/observation/../../common.py /opt/amazon/markov/filters/observation/../../constants.py /opt/amazon/markov/filters/observation/../../deepracer_memory.py @@ -225,7 +206,6 @@ setup.py /opt/amazon/markov/filters/observation/../../rollout_worker.py /opt/amazon/markov/filters/observation/../../rospy_wrappers.py /opt/amazon/markov/filters/observation/../../s3_boto_data_store.py -/opt/amazon/markov/filters/observation/../../s3_client.py /opt/amazon/markov/filters/observation/../../sagemaker_graph_manager.py /opt/amazon/markov/filters/observation/../../tournament_worker.py /opt/amazon/markov/filters/observation/../../training_worker.py @@ -237,7 +217,6 @@ setup.py /opt/amazon/markov/gazebo_tracker/tracker_manager.py /opt/amazon/markov/gazebo_tracker/../__init__.py /opt/amazon/markov/gazebo_tracker/../camera_utils.py -/opt/amazon/markov/gazebo_tracker/../checkpoint_utils.py /opt/amazon/markov/gazebo_tracker/../common.py /opt/amazon/markov/gazebo_tracker/../constants.py /opt/amazon/markov/gazebo_tracker/../deepracer_memory.py @@ -248,7 +227,6 @@ setup.py /opt/amazon/markov/gazebo_tracker/../rollout_worker.py /opt/amazon/markov/gazebo_tracker/../rospy_wrappers.py /opt/amazon/markov/gazebo_tracker/../s3_boto_data_store.py -/opt/amazon/markov/gazebo_tracker/../s3_client.py /opt/amazon/markov/gazebo_tracker/../sagemaker_graph_manager.py /opt/amazon/markov/gazebo_tracker/../tournament_worker.py /opt/amazon/markov/gazebo_tracker/../training_worker.py @@ -262,7 +240,6 @@ setup.py /opt/amazon/markov/gazebo_tracker/trackers/set_visual_transparency_tracker.py /opt/amazon/markov/gazebo_tracker/trackers/../../__init__.py /opt/amazon/markov/gazebo_tracker/trackers/../../camera_utils.py -/opt/amazon/markov/gazebo_tracker/trackers/../../checkpoint_utils.py 
/opt/amazon/markov/gazebo_tracker/trackers/../../common.py /opt/amazon/markov/gazebo_tracker/trackers/../../constants.py /opt/amazon/markov/gazebo_tracker/trackers/../../deepracer_memory.py @@ -273,7 +250,6 @@ setup.py /opt/amazon/markov/gazebo_tracker/trackers/../../rollout_worker.py /opt/amazon/markov/gazebo_tracker/trackers/../../rospy_wrappers.py /opt/amazon/markov/gazebo_tracker/trackers/../../s3_boto_data_store.py -/opt/amazon/markov/gazebo_tracker/trackers/../../s3_client.py /opt/amazon/markov/gazebo_tracker/trackers/../../sagemaker_graph_manager.py /opt/amazon/markov/gazebo_tracker/trackers/../../tournament_worker.py /opt/amazon/markov/gazebo_tracker/trackers/../../training_worker.py @@ -286,7 +262,6 @@ setup.py /opt/amazon/markov/log_handler/logger.py /opt/amazon/markov/log_handler/../__init__.py /opt/amazon/markov/log_handler/../camera_utils.py -/opt/amazon/markov/log_handler/../checkpoint_utils.py /opt/amazon/markov/log_handler/../common.py /opt/amazon/markov/log_handler/../constants.py /opt/amazon/markov/log_handler/../deepracer_memory.py @@ -297,7 +272,6 @@ setup.py /opt/amazon/markov/log_handler/../rollout_worker.py /opt/amazon/markov/log_handler/../rospy_wrappers.py /opt/amazon/markov/log_handler/../s3_boto_data_store.py -/opt/amazon/markov/log_handler/../s3_client.py /opt/amazon/markov/log_handler/../sagemaker_graph_manager.py /opt/amazon/markov/log_handler/../tournament_worker.py /opt/amazon/markov/log_handler/../training_worker.py @@ -307,7 +281,6 @@ setup.py /opt/amazon/markov/memories/deepracer_memory.py /opt/amazon/markov/memories/../__init__.py /opt/amazon/markov/memories/../camera_utils.py -/opt/amazon/markov/memories/../checkpoint_utils.py /opt/amazon/markov/memories/../common.py /opt/amazon/markov/memories/../constants.py /opt/amazon/markov/memories/../deepracer_memory.py @@ -318,7 +291,6 @@ setup.py /opt/amazon/markov/memories/../rollout_worker.py /opt/amazon/markov/memories/../rospy_wrappers.py 
/opt/amazon/markov/memories/../s3_boto_data_store.py -/opt/amazon/markov/memories/../s3_client.py /opt/amazon/markov/memories/../sagemaker_graph_manager.py /opt/amazon/markov/memories/../tournament_worker.py /opt/amazon/markov/memories/../training_worker.py @@ -331,7 +303,6 @@ setup.py /opt/amazon/markov/metrics/s3_metrics.py /opt/amazon/markov/metrics/../__init__.py /opt/amazon/markov/metrics/../camera_utils.py -/opt/amazon/markov/metrics/../checkpoint_utils.py /opt/amazon/markov/metrics/../common.py /opt/amazon/markov/metrics/../constants.py /opt/amazon/markov/metrics/../deepracer_memory.py @@ -342,7 +313,6 @@ setup.py /opt/amazon/markov/metrics/../rollout_worker.py /opt/amazon/markov/metrics/../rospy_wrappers.py /opt/amazon/markov/metrics/../s3_boto_data_store.py -/opt/amazon/markov/metrics/../s3_client.py /opt/amazon/markov/metrics/../sagemaker_graph_manager.py /opt/amazon/markov/metrics/../tournament_worker.py /opt/amazon/markov/metrics/../training_worker.py @@ -354,7 +324,6 @@ setup.py /opt/amazon/markov/multi_agent_coach/multi_agent_level_manager.py /opt/amazon/markov/multi_agent_coach/../__init__.py /opt/amazon/markov/multi_agent_coach/../camera_utils.py -/opt/amazon/markov/multi_agent_coach/../checkpoint_utils.py /opt/amazon/markov/multi_agent_coach/../common.py /opt/amazon/markov/multi_agent_coach/../constants.py /opt/amazon/markov/multi_agent_coach/../deepracer_memory.py @@ -365,7 +334,6 @@ setup.py /opt/amazon/markov/multi_agent_coach/../rollout_worker.py /opt/amazon/markov/multi_agent_coach/../rospy_wrappers.py /opt/amazon/markov/multi_agent_coach/../s3_boto_data_store.py -/opt/amazon/markov/multi_agent_coach/../s3_client.py /opt/amazon/markov/multi_agent_coach/../sagemaker_graph_manager.py /opt/amazon/markov/multi_agent_coach/../tournament_worker.py /opt/amazon/markov/multi_agent_coach/../training_worker.py @@ -378,7 +346,6 @@ setup.py /opt/amazon/markov/reset/utils.py /opt/amazon/markov/reset/../__init__.py /opt/amazon/markov/reset/../camera_utils.py 
-/opt/amazon/markov/reset/../checkpoint_utils.py /opt/amazon/markov/reset/../common.py /opt/amazon/markov/reset/../constants.py /opt/amazon/markov/reset/../deepracer_memory.py @@ -389,7 +356,6 @@ setup.py /opt/amazon/markov/reset/../rollout_worker.py /opt/amazon/markov/reset/../rospy_wrappers.py /opt/amazon/markov/reset/../s3_boto_data_store.py -/opt/amazon/markov/reset/../s3_client.py /opt/amazon/markov/reset/../sagemaker_graph_manager.py /opt/amazon/markov/reset/../tournament_worker.py /opt/amazon/markov/reset/../training_worker.py @@ -403,7 +369,6 @@ setup.py /opt/amazon/markov/reset/rules/reverse_reset_rule.py /opt/amazon/markov/reset/rules/../../__init__.py /opt/amazon/markov/reset/rules/../../camera_utils.py -/opt/amazon/markov/reset/rules/../../checkpoint_utils.py /opt/amazon/markov/reset/rules/../../common.py /opt/amazon/markov/reset/rules/../../constants.py /opt/amazon/markov/reset/rules/../../deepracer_memory.py @@ -414,7 +379,6 @@ setup.py /opt/amazon/markov/reset/rules/../../rollout_worker.py /opt/amazon/markov/reset/rules/../../rospy_wrappers.py /opt/amazon/markov/reset/rules/../../s3_boto_data_store.py -/opt/amazon/markov/reset/rules/../../s3_client.py /opt/amazon/markov/reset/rules/../../sagemaker_graph_manager.py /opt/amazon/markov/reset/rules/../../tournament_worker.py /opt/amazon/markov/reset/rules/../../training_worker.py @@ -426,7 +390,6 @@ setup.py /opt/amazon/markov/s3/utils.py /opt/amazon/markov/s3/../__init__.py /opt/amazon/markov/s3/../camera_utils.py -/opt/amazon/markov/s3/../checkpoint_utils.py /opt/amazon/markov/s3/../common.py /opt/amazon/markov/s3/../constants.py /opt/amazon/markov/s3/../deepracer_memory.py @@ -437,13 +400,13 @@ setup.py /opt/amazon/markov/s3/../rollout_worker.py /opt/amazon/markov/s3/../rospy_wrappers.py /opt/amazon/markov/s3/../s3_boto_data_store.py -/opt/amazon/markov/s3/../s3_client.py /opt/amazon/markov/s3/../sagemaker_graph_manager.py /opt/amazon/markov/s3/../tournament_worker.py 
/opt/amazon/markov/s3/../training_worker.py /opt/amazon/markov/s3/../utils.py /opt/amazon/markov/s3/../validation_worker.py /opt/amazon/markov/s3/files/__init__.py +/opt/amazon/markov/s3/files/checkpoint.py /opt/amazon/markov/s3/files/hyperparameters.py /opt/amazon/markov/s3/files/ip_config.py /opt/amazon/markov/s3/files/metrics.py @@ -453,7 +416,6 @@ setup.py /opt/amazon/markov/s3/files/yaml_file.py /opt/amazon/markov/s3/files/../../__init__.py /opt/amazon/markov/s3/files/../../camera_utils.py -/opt/amazon/markov/s3/files/../../checkpoint_utils.py /opt/amazon/markov/s3/files/../../common.py /opt/amazon/markov/s3/files/../../constants.py /opt/amazon/markov/s3/files/../../deepracer_memory.py @@ -464,17 +426,37 @@ setup.py /opt/amazon/markov/s3/files/../../rollout_worker.py /opt/amazon/markov/s3/files/../../rospy_wrappers.py /opt/amazon/markov/s3/files/../../s3_boto_data_store.py -/opt/amazon/markov/s3/files/../../s3_client.py /opt/amazon/markov/s3/files/../../sagemaker_graph_manager.py /opt/amazon/markov/s3/files/../../tournament_worker.py /opt/amazon/markov/s3/files/../../training_worker.py /opt/amazon/markov/s3/files/../../utils.py /opt/amazon/markov/s3/files/../../validation_worker.py +/opt/amazon/markov/s3/files/checkpoint_files/__init__.py +/opt/amazon/markov/s3/files/checkpoint_files/deepracer_checkpoint_json.py +/opt/amazon/markov/s3/files/checkpoint_files/rl_coach_checkpoint.py +/opt/amazon/markov/s3/files/checkpoint_files/rl_coach_sync_file.py +/opt/amazon/markov/s3/files/checkpoint_files/tensorflow_model.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../__init__.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../camera_utils.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../common.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../constants.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../deepracer_memory.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../defaults.py 
+/opt/amazon/markov/s3/files/checkpoint_files/../../../evaluation_worker.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../rollout_constants.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../rollout_utils.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../rollout_worker.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../rospy_wrappers.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../s3_boto_data_store.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../sagemaker_graph_manager.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../tournament_worker.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../training_worker.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../utils.py +/opt/amazon/markov/s3/files/checkpoint_files/../../../validation_worker.py /opt/amazon/markov/samples/__init__.py /opt/amazon/markov/samples/sample_collector.py /opt/amazon/markov/samples/../__init__.py /opt/amazon/markov/samples/../camera_utils.py -/opt/amazon/markov/samples/../checkpoint_utils.py /opt/amazon/markov/samples/../common.py /opt/amazon/markov/samples/../constants.py /opt/amazon/markov/samples/../deepracer_memory.py @@ -485,7 +467,6 @@ setup.py /opt/amazon/markov/samples/../rollout_worker.py /opt/amazon/markov/samples/../rospy_wrappers.py /opt/amazon/markov/samples/../s3_boto_data_store.py -/opt/amazon/markov/samples/../s3_client.py /opt/amazon/markov/samples/../sagemaker_graph_manager.py /opt/amazon/markov/samples/../tournament_worker.py /opt/amazon/markov/samples/../training_worker.py @@ -499,7 +480,6 @@ setup.py /opt/amazon/markov/sensors/utils.py /opt/amazon/markov/sensors/../__init__.py /opt/amazon/markov/sensors/../camera_utils.py -/opt/amazon/markov/sensors/../checkpoint_utils.py /opt/amazon/markov/sensors/../common.py /opt/amazon/markov/sensors/../constants.py /opt/amazon/markov/sensors/../deepracer_memory.py @@ -510,7 +490,6 @@ setup.py /opt/amazon/markov/sensors/../rollout_worker.py 
/opt/amazon/markov/sensors/../rospy_wrappers.py /opt/amazon/markov/sensors/../s3_boto_data_store.py -/opt/amazon/markov/sensors/../s3_client.py /opt/amazon/markov/sensors/../sagemaker_graph_manager.py /opt/amazon/markov/sensors/../tournament_worker.py /opt/amazon/markov/sensors/../training_worker.py @@ -522,7 +501,6 @@ setup.py /opt/amazon/markov/track_geom/utils.py /opt/amazon/markov/track_geom/../__init__.py /opt/amazon/markov/track_geom/../camera_utils.py -/opt/amazon/markov/track_geom/../checkpoint_utils.py /opt/amazon/markov/track_geom/../common.py /opt/amazon/markov/track_geom/../constants.py /opt/amazon/markov/track_geom/../deepracer_memory.py @@ -533,7 +511,6 @@ setup.py /opt/amazon/markov/track_geom/../rollout_worker.py /opt/amazon/markov/track_geom/../rospy_wrappers.py /opt/amazon/markov/track_geom/../s3_boto_data_store.py -/opt/amazon/markov/track_geom/../s3_client.py /opt/amazon/markov/track_geom/../sagemaker_graph_manager.py /opt/amazon/markov/track_geom/../tournament_worker.py /opt/amazon/markov/track_geom/../training_worker.py @@ -545,7 +522,6 @@ setup.py /opt/amazon/markov/track_geom/spline/track_spline.py /opt/amazon/markov/track_geom/spline/../../__init__.py /opt/amazon/markov/track_geom/spline/../../camera_utils.py -/opt/amazon/markov/track_geom/spline/../../checkpoint_utils.py /opt/amazon/markov/track_geom/spline/../../common.py /opt/amazon/markov/track_geom/spline/../../constants.py /opt/amazon/markov/track_geom/spline/../../deepracer_memory.py @@ -556,7 +532,6 @@ setup.py /opt/amazon/markov/track_geom/spline/../../rollout_worker.py /opt/amazon/markov/track_geom/spline/../../rospy_wrappers.py /opt/amazon/markov/track_geom/spline/../../s3_boto_data_store.py -/opt/amazon/markov/track_geom/spline/../../s3_client.py /opt/amazon/markov/track_geom/spline/../../sagemaker_graph_manager.py /opt/amazon/markov/track_geom/spline/../../tournament_worker.py /opt/amazon/markov/track_geom/spline/../../training_worker.py @@ -567,7 +542,6 @@ setup.py 
/opt/amazon/markov/visual_effects/effect_manager.py /opt/amazon/markov/visual_effects/../__init__.py /opt/amazon/markov/visual_effects/../camera_utils.py -/opt/amazon/markov/visual_effects/../checkpoint_utils.py /opt/amazon/markov/visual_effects/../common.py /opt/amazon/markov/visual_effects/../constants.py /opt/amazon/markov/visual_effects/../deepracer_memory.py @@ -578,7 +552,6 @@ setup.py /opt/amazon/markov/visual_effects/../rollout_worker.py /opt/amazon/markov/visual_effects/../rospy_wrappers.py /opt/amazon/markov/visual_effects/../s3_boto_data_store.py -/opt/amazon/markov/visual_effects/../s3_client.py /opt/amazon/markov/visual_effects/../sagemaker_graph_manager.py /opt/amazon/markov/visual_effects/../tournament_worker.py /opt/amazon/markov/visual_effects/../training_worker.py @@ -588,7 +561,6 @@ setup.py /opt/amazon/markov/visual_effects/effects/blink_effect.py /opt/amazon/markov/visual_effects/effects/../../__init__.py /opt/amazon/markov/visual_effects/effects/../../camera_utils.py -/opt/amazon/markov/visual_effects/effects/../../checkpoint_utils.py /opt/amazon/markov/visual_effects/effects/../../common.py /opt/amazon/markov/visual_effects/effects/../../constants.py /opt/amazon/markov/visual_effects/effects/../../deepracer_memory.py @@ -599,7 +571,6 @@ setup.py /opt/amazon/markov/visual_effects/effects/../../rollout_worker.py /opt/amazon/markov/visual_effects/effects/../../rospy_wrappers.py /opt/amazon/markov/visual_effects/effects/../../s3_boto_data_store.py -/opt/amazon/markov/visual_effects/effects/../../s3_client.py /opt/amazon/markov/visual_effects/effects/../../sagemaker_graph_manager.py /opt/amazon/markov/visual_effects/effects/../../tournament_worker.py /opt/amazon/markov/visual_effects/effects/../../training_worker.py @@ -609,7 +580,6 @@ setup.py /opt/amazon/markov/visualizations/reward_distributions.py /opt/amazon/markov/visualizations/../__init__.py /opt/amazon/markov/visualizations/../camera_utils.py 
-/opt/amazon/markov/visualizations/../checkpoint_utils.py /opt/amazon/markov/visualizations/../common.py /opt/amazon/markov/visualizations/../constants.py /opt/amazon/markov/visualizations/../deepracer_memory.py @@ -620,7 +590,6 @@ setup.py /opt/amazon/markov/visualizations/../rollout_worker.py /opt/amazon/markov/visualizations/../rospy_wrappers.py /opt/amazon/markov/visualizations/../s3_boto_data_store.py -/opt/amazon/markov/visualizations/../s3_client.py /opt/amazon/markov/visualizations/../sagemaker_graph_manager.py /opt/amazon/markov/visualizations/../tournament_worker.py /opt/amazon/markov/visualizations/../training_worker.py