
August 31 updates
larsll committed Sep 30, 2020
commit 48c2065 · 1 parent: 0fe34e9
Showing 30 changed files with 1,866 additions and 1,145 deletions.
@@ -35,6 +35,7 @@
from markov.s3.constants import (MODEL_METADATA_LOCAL_PATH_FORMAT, MODEL_METADATA_S3_POSTFIX,
YAML_LOCAL_PATH_FORMAT, AgentType, YamlKey)
from markov.s3.utils import get_s3_key
from markov.s3.s3_client import S3Client

logger = Logger(__name__, logging.INFO).get_logger()
# Amount of time to wait to guarantee that RoboMaker's network configuration is ready.
@@ -143,7 +144,7 @@ def main():
# create boto3 session/client and download yaml/json file
session = boto3.session.Session()
s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", None)
s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_endpoint_url, config=get_boto_config())
s3_client = S3Client(region_name=s3_region,s3_endpoint_url=s3_endpoint_url)

# Intermediate tournament files
queue_pickle_name = 'tournament_candidate_queue.pkl'
@@ -158,12 +159,13 @@ def main():
final_report_s3_key = os.path.normpath(os.path.join(s3_prefix, final_report_name))

try:
s3_client.download_file(Bucket=s3_bucket,
Key=queue_pickle_s3_key,
Filename=local_queue_pickle_path)
s3_client.download_file(Bucket=s3_bucket,
Key=report_pickle_s3_key,
Filename=local_report_pickle_path)
s3_client.download_file(bucket=s3_bucket,
s3_key=queue_pickle_s3_key,
local_path=local_queue_pickle_path)

s3_client.download_file(bucket=s3_bucket,
s3_key=report_pickle_s3_key,
local_path=local_report_pickle_path)
except:
pass

@@ -307,15 +309,18 @@ def main():
# Persist latest queue and report to use after job restarts.
with open(local_queue_pickle_path, 'wb') as f:
pickle.dump(tournament_candidate_queue, f, protocol=2)
s3_client.upload_file(Filename=local_queue_pickle_path,
Bucket=s3_bucket,
Key=queue_pickle_s3_key, ExtraArgs=s3_extra_args)
s3_client.upload_file(bucket=s3_bucket,
s3_key=queue_pickle_s3_key,
local_path=local_queue_pickle_path,
s3_kms_extra_args=s3_extra_args)

with open(local_report_pickle_path, 'wb') as f:
pickle.dump(tournament_report, f, protocol=2)
s3_client.upload_file(Filename=local_report_pickle_path,
Bucket=s3_bucket,
Key=report_pickle_s3_key, ExtraArgs=s3_extra_args)

s3_client.upload_file(bucket=s3_bucket,
s3_key=report_pickle_s3_key,
local_path=local_report_pickle_path,
s3_kms_extra_args=s3_extra_args)

# If there is more than 1 candidates then restart the simulation job otherwise
# tournament is finished, persists final report and ends the job.
@@ -326,9 +331,10 @@
else:
# Persist final tournament report in json format
# and terminate the job by canceling it
s3_client.put_object(Bucket=s3_bucket,
Key=final_report_s3_key,
Body=json.dumps(tournament_report), **s3_extra_args)
s3_client.put_object(bucket=s3_bucket,
s3_key=final_report_s3_key,
body=json.dumps(tournament_report),
s3_kms_extra_args=s3_extra_args)

cancel_simulation_job(os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
s3_region)
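
The hunks in this file replace the raw boto3 client with the new markov.s3.s3_client.S3Client wrapper, which takes lowercase bucket/s3_key/local_path keywords and threads the KMS encryption arguments through explicitly. As a reading aid, here is a minimal sketch of the wrapper surface those call sites imply; the method and keyword names are taken from the diff, while the pass-through bodies and retry configuration are assumptions rather than the actual implementation.

```python
# Minimal sketch of the S3Client surface these call sites imply. Method and
# keyword names come from the hunks above; the plain boto3 pass-through
# bodies and the retry configuration are assumptions, not the real
# markov.s3.s3_client implementation.
import boto3
from botocore.config import Config


class S3Client:
    def __init__(self, region_name, s3_endpoint_url=None, max_retry_attempts=5):
        self._client = boto3.session.Session().client(
            's3', region_name=region_name, endpoint_url=s3_endpoint_url,
            config=Config(retries={'max_attempts': max_retry_attempts}))

    def download_file(self, bucket, s3_key, local_path):
        self._client.download_file(Bucket=bucket, Key=s3_key, Filename=local_path)

    def upload_file(self, bucket, s3_key, local_path, s3_kms_extra_args=None):
        self._client.upload_file(Filename=local_path, Bucket=bucket, Key=s3_key,
                                 ExtraArgs=s3_kms_extra_args)

    def put_object(self, bucket, s3_key, body, s3_kms_extra_args=None):
        self._client.put_object(Bucket=bucket, Key=s3_key, Body=body,
                                **(s3_kms_extra_args or {}))
```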

This file was deleted.

@@ -10,10 +10,6 @@
# The robomaker team has asked us to wait 5 minutes to let their workflow cancel
# the simulation job
ROBOMAKER_CANCEL_JOB_WAIT_TIME = 60 * 5
# The current checkpoint key
CHKPNT_KEY_SUFFIX = "model/.coach_checkpoint"
# This is the key for the best checkpoint
DEEPRACER_CHKPNT_KEY_SUFFIX = "model/deepracer_checkpoints.json"
# The number of times to retry a failed boto call
NUM_RETRIES = 5
# The time in seconds till a timeout exception is thrown when attempting to make a connection
@@ -30,7 +30,6 @@
configure_environment_randomizer, get_robomaker_profiler_env)
from markov.rospy_wrappers import ServiceProxyWrapper
from markov.camera_utils import configure_camera
from markov.checkpoint_utils import TEMP_RENAME_FOLDER, wait_for_checkpoints, modify_checkpoint_variables
from markov.track_geom.track_data import TrackData
from markov.track_geom.utils import get_start_positions
from markov.s3.constants import (MODEL_METADATA_LOCAL_PATH_FORMAT,
@@ -42,6 +41,7 @@
SimtraceVideoNames)
from markov.s3.files.model_metadata import ModelMetadata
from markov.s3.files.simtrace_video import SimtraceVideo
from markov.s3.files.checkpoint import Checkpoint
from markov.s3.utils import get_s3_key

from std_srvs.srv import Empty, EmptyRequest
@@ -50,9 +50,6 @@

MIN_RESET_COUNT = 10000 #TODO: change when console passes float("inf")

if not os.path.exists(TEMP_RENAME_FOLDER):
os.makedirs(TEMP_RENAME_FOLDER)

IS_PROFILER_ON, PROFILER_S3_BUCKET, PROFILER_S3_PREFIX = get_robomaker_profiler_env()

def evaluation_worker(graph_manager, number_of_trials, task_parameters, simtrace_video_s3_writers, is_continuous,
@@ -71,21 +68,15 @@ def evaluation_worker(graph_manager, number_of_trials, task_parameters, simtrace
# Collect profiler information only IS_PROFILER_ON is true
with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET, s3_prefix=PROFILER_S3_PREFIX,
output_local_path=ROLLOUT_WORKER_PROFILER_PATH, enable_profiling=IS_PROFILER_ON):
checkpoint_dirs = list()
agent_names = list()
subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic = list(), list()
subscribe_to_save_mp4, unsubscribe_from_save_mp4 = list(), list()
for agent_param in graph_manager.agents_params:
_checkpoint_dir = task_parameters.checkpoint_restore_path if len(graph_manager.agents_params) == 1 \
else os.path.join(task_parameters.checkpoint_restore_path, agent_param.name)
agent_names.append(agent_param.name)
checkpoint_dirs.append(_checkpoint_dir)
racecar_name = 'racecar' if len(agent_param.name.split("_")) == 1 \
else "racecar_{}".format(agent_param.name.split("_")[1])
subscribe_to_save_mp4_topic.append("/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name))
unsubscribe_from_save_mp4_topic.append("/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name))
wait_for_checkpoints(checkpoint_dirs, graph_manager.data_store)
modify_checkpoint_variables(checkpoint_dirs, agent_names)
graph_manager.data_store.wait_for_checkpoints()
graph_manager.data_store.modify_checkpoint_variables()

# Make the clients that will allow us to pause and unpause the physics
rospy.wait_for_service('/gazebo/pause_physics_dr')
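
Where the worker previously collected checkpoint_dirs and agent_names and passed them to the module-level wait_for_checkpoints/modify_checkpoint_variables helpers from markov.checkpoint_utils, it now asks the data store, which already holds each agent's Checkpoint. A hypothetical sketch of that delegation follows; only the two method names are confirmed by the diff, and the checkpoint_dict attribute, file layout, and timeout are assumptions.

```python
import os
import time


class S3BotoDataStore:
    """Hypothetical delegation sketch; not the real markov data store."""

    def __init__(self, params, graph_manager, ignore_lock=False):
        self._checkpoint_dict = params.checkpoint_dict  # agent name -> Checkpoint
        self._graph_manager = graph_manager
        self._ignore_lock = ignore_lock

    def wait_for_checkpoints(self, timeout_sec=600):
        # Block until each agent's coach checkpoint file exists locally;
        # '.coach_checkpoint' mirrors the CHKPNT_KEY_SUFFIX constant this
        # commit deletes.
        deadline = time.time() + timeout_sec
        for checkpoint in self._checkpoint_dict.values():
            coach_file = os.path.join(checkpoint.checkpoint_dir,  # assumed attr
                                      '.coach_checkpoint')
            while not os.path.exists(coach_file):
                if time.time() > deadline:
                    raise RuntimeError('Checkpoint never found')  # message 35
                time.sleep(1)
```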
@@ -253,6 +244,7 @@ def main():
agent_list = list()
s3_bucket_dict = dict()
s3_prefix_dict = dict()
checkpoint_dict = dict()
start_positions = get_start_positions(len(arg_s3_bucket))
done_condition = utils.str_to_done_condition(rospy.get_param("DONE_CONDITION", any))
park_positions = utils.pos_2d_str_to_list(rospy.get_param("PARK_POSITIONS", []))
@@ -274,11 +266,24 @@
_, _, version = model_metadata.get_model_metadata_info()


# Select the optimal model
utils.do_model_selection(s3_bucket=arg_s3_bucket[agent_index],
# checkpoint s3 instance
checkpoint = Checkpoint(bucket=arg_s3_bucket[agent_index],
s3_prefix=arg_s3_prefix[agent_index],
region=args.aws_region,
s3_endpoint_url=args.s3_endpoint_url)
region_name=args.aws_region,
s3_endpoint_url=args.s3_endpoint_url,
agent_name=agent_name,
checkpoint_dir=args.local_model_directory)
# make coach checkpoint compatible
if version < SIMAPP_VERSION_2 and not checkpoint.rl_coach_checkpoint.is_compatible():
checkpoint.rl_coach_checkpoint.make_compatible(checkpoint.syncfile_ready)
# get best model checkpoint string
model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint()
# Select the best checkpoint model by uploading rl coach .coach_checkpoint file
checkpoint.rl_coach_checkpoint.update(
model_checkpoint_name=model_checkpoint_name,
s3_kms_extra_args=utils.get_s3_kms_extra_args())

checkpoint_dict[agent_name] = checkpoint

agent_config = {
'model_metadata': model_metadata,
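
The Checkpoint object constructed above bundles the per-agent checkpoint state that main() previously reached through utils.do_model_selection and the key-suffix constants deleted earlier in this commit. The skeleton below maps the attribute surface this commit exercises; the names come from the call sites, while the structure is a sketch rather than the real markov.s3.files.checkpoint source.

```python
class Checkpoint:
    """Sketch of the surface used in this commit; internals assumed."""

    def __init__(self, bucket, s3_prefix, region_name, s3_endpoint_url,
                 agent_name, checkpoint_dir):
        # Wraps model/.coach_checkpoint (the deleted CHKPNT_KEY_SUFFIX);
        # offers is_compatible(), make_compatible(syncfile), update(...).
        self.rl_coach_checkpoint = ...
        # Wraps model/deepracer_checkpoints.json (the deleted
        # DEEPRACER_CHKPNT_KEY_SUFFIX); offers get_deepracer_best_checkpoint().
        self.deepracer_checkpoint_json = ...
        # The .ready sync file handed to make_compatible().
        self.syncfile_ready = ...
```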
@@ -364,13 +369,10 @@
enable_domain_randomization=enable_domain_randomization,
done_condition=done_condition)

ds_params_instance = S3BotoDataStoreParameters(aws_region=args.aws_region,
bucket_names=s3_bucket_dict,
base_checkpoint_dir=args.local_model_directory,
s3_folders=s3_prefix_dict,
s3_endpoint_url=args.s3_endpoint_url)
ds_params_instance = S3BotoDataStoreParameters(checkpoint_dict=checkpoint_dict)

graph_manager.data_store = S3BotoDataStore(params=ds_params_instance, graph_manager=graph_manager,
graph_manager.data_store = S3BotoDataStore(params=ds_params_instance,
graph_manager=graph_manager,
ignore_lock=True)
graph_manager.env_params.seed = 0

@@ -59,8 +59,8 @@
32: "No objects found",
33: "No checkpoint file found",
34: "Unable to make model compatible",
35: "Checkpoint never found in",
36: "Download params and launch of agent node S3 ClientError",
35: "Checkpoint never found",
36: "Failed to parse model_metadata file",
37: "Validation worker value error",
38: "Unable to write metrics to s3: bucket",
39: "Unable to write metrics to s3, exception",
@@ -107,7 +107,15 @@
80: "Exception in putting objects",
81: "Unable to upload fileobj",
82: "Unable to list objects",
83: "Unable to put object"
83: "Unable to put object",
84: "Exception in uploading .finished file",
85: "Exception in uploading .lock file",
86: "Exception in uploading .ready file",
87: "Unable to delete object from s3",
88: "Can't download deepracer checkpoint json",
89: "ready never found",
90: "Exception in downloading .ready",
91: "Unable to paginate from s3"
}

# New error yet to be mapped