In [7]:
import sys
import importlib
import os
sys.path.append('..')
import boto3
import json
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch
from datetime import datetime
import os
import pprint
import subprocess
import yaml

# Local folder (relative to the notebook) that receives one JSON description
# file per submitted job; create it up front and tolerate it already existing.
working_dir = "../sagemaker_jobs"
os.makedirs(name=working_dir, exist_ok=True)

In [8]:
# Configure the AWS CLI region, build the ECR training image URI and resolve
# the IAM role used to launch the training job.
region = "us-west-2"
# Point the default AWS CLI profile at the target region; the command output is discarded.
set_region = f"aws configure set region {region} --profile default"
_ = subprocess.check_output(set_region, shell=True)

user_id = "jbsnyder"
# Ask STS (through its regional endpoint) for the account id of the current credentials.
account_call = f"aws sts get-caller-identity --region {region} --endpoint-url https://sts.{region}.amazonaws.com --query Account --output text"
ecr_account = subprocess.check_output(account_call, shell=True).decode().strip()
ecr_repo = user_id
algo_name = "smcv-pt-1.8"
# Image URI layout: <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
docker_image = "{0}.dkr.ecr.{1}.amazonaws.com/{2}:{3}".format(ecr_account,
                                                              region,
                                                              ecr_repo,
                                                              algo_name)

# Get Sagemaker execution role

try:
    role = get_execution_role()
except Exception:
    # No execution role could be resolved (presumably because the notebook is
    # not running on a SageMaker-managed instance -- confirm), so fall back to
    # the "<user_id>-ec2" IAM role.
    # `get_role` returns the whole API response; the estimator needs the role
    # ARN string, which lives under Role -> Arn.
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='{}-ec2'.format(user_id))['Role']['Arn']
# Day-level stamp (DD-MM-YYYY) taken when this cell runs.
date_str = datetime.now().strftime("%d-%m-%Y")

In [9]:
# Take a single timestamp for both the date folder and the job name so the S3
# locations always agree with the job name, even when this cell is re-run on a
# later day than the cell that originally defined date_str.
launch_time = datetime.now()
date_str = launch_time.strftime("%d-%m-%Y")
time_str = launch_time.strftime("%d-%m-%Y-%H-%M-%S")
batch_size = 256
nodes = 4
# Single-node runs pick the "torch" config file, multi-node runs the "smd" one.
dist_name = "torch" if nodes==1 else "smd"
model = "crcnn"
config_name = f"configs/{dist_name}_{model}_bs{batch_size}.yaml"
with open(config_name, 'r') as config_file_reader:
    configuration = yaml.safe_load(config_file_reader)

# Set directory and entry point

source_dir = "."
main_script = "launch_torch.py"

# Instance configuration

instance_type = "ml.p3.16xlarge"
# 8 for the listed instance types, 4 otherwise. Presumably the number of
# training processes per instance; it is not passed to the estimator below.
processes_per_host = 8 if instance_type in ['ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge'] else 4
instance_count = nodes

# Distribution Strategy

# The loaded config file, not `dist_name`, decides which launcher is used.
distribution_type = configuration['DISTRIBUTION']

assert distribution_type in ['torch', 'smd'], \
    f"unsupported DISTRIBUTION {distribution_type!r} in {config_name}"

if distribution_type=="smd":
    # SageMaker distributed data parallel: train.py replaces the default entry point.
    distribution = { "smdistributed": { "dataparallel": { "enabled": True } } }
    main_script = "train.py"
elif distribution_type=="torch":
    distribution = None

job_name = '{}-{}'.format(user_id, time_str)

# S3 URIs always use "/" separators, so they are assembled with plain string
# formatting instead of os.path.join (which follows the host OS separator).
s3_bucket = "jbsnyder-sagemaker-pdx" if region=="us-west-2" else "jbsnyder-sagemaker-iad"
s3_path = 's3://{}/'.format(s3_bucket)

output_path = f"{s3_path}sagemaker-output/{date_str}/{job_name}"

code_location = f"{s3_path}sagemaker_code/{date_str}/{job_name}"

# data input channels
s3_data_dir = "s3://{}/data/".format(s3_bucket)
s3_coco_dir = "coco/2017/archive/"
s3_weights_dir = "weights/"

channels = {
    'coco': s3_data_dir + s3_coco_dir,
    'weights': s3_data_dir + s3_weights_dir,
}

# Forwarded to the entry point as --config / --unarchive command-line arguments.
hyperparameters = {"config": config_name,
                   "unarchive": '/opt/ml/input/data/coco/'}

estimator = PyTorch(
                entry_point=main_script, 
                source_dir=source_dir, 
                image_uri=docker_image, 
                role=role,
                instance_count=instance_count,
                instance_type=instance_type,
                distribution=distribution,
                output_path=output_path,
                checkpoint_s3_uri=output_path,
                model_dir=output_path,
                hyperparameters=hyperparameters,
                volume_size=500,
                disable_profiler=True,
                debugger_hook_config=False,
                code_location=code_location
)

# Record what is being launched next to the notebook before the job starts,
# so the description exists even if the job fails.
job_desc = {"job_name": job_name, 
            "config_name": config_name,
            "config": configuration}

with open(os.path.join(working_dir, f"{job_name}.json"), 'w') as desc_file:
    json.dump(job_desc, desc_file)

# wait=True blocks the cell until the training job ends.
estimator.fit(channels, wait=True, job_name=job_name)

2021-06-29 15:56:05 Starting - Starting the training job...
2021-06-29 15:56:07 Starting - Launching requested ML instances.........
2021-06-29 15:57:46 Starting - Preparing the instances for training.........
2021-06-29 15:59:25 Downloading - Downloading input data...................................................
2021-06-29 16:07:43 Training - Downloading the training image..............................
2021-06-29 16:13:01 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34msed: can't read changehostname.c: No such file or directory[0m
[34mgcc: error: changehostname.c: No such file or directory[0m
[34mgcc: fatal error: no input files[0m
[34mcompilation terminated.[0m
[34mgcc: error: changehostname.o: No such file or directory[0m
[34mERROR: ld.so: object '/libchangehostname.so' from LD_PRELOAD cannot be preloaded (cannot open 

UnexpectedStatusException: Error for Training job jbsnyder-29-06-2021-15-56-05: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "mpirun --host algo-1:8,algo-2:8,algo-3:8,algo-4:8 -np 32 --allow-run-as-root --tag-output --oversubscribe -mca btl_tcp_if_include eth0 -mca oob_tcp_if_include eth0 -mca plm_rsh_no_tree_spawn 1 -mca pml ob1 -mca btl ^openib -mca orte_abort_on_non_zero_status 1 -mca btl_vader_single_copy_mechanism none -mca plm_rsh_num_concurrent 4 -x NCCL_SOCKET_IFNAME=eth0 -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH -x SMDATAPARALLEL_USE_HOMOGENEOUS=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 -x LD_PRELOAD=/opt/conda/lib/python3.6/site-packages/gethostname.cpython-36m-x86_64-linux-gnu.so -x SMDATAPARALLEL_SERVER_ADDR=algo-1 -x SMDATAPARALLEL_SERVER_PORT=7592 -x SAGEMAKER_INSTANCE_TYPE=ml.p3.16xlarge smddprun /opt/conda/bin/python3.6 -m mpi4py train.py --config configs/smd_crcnn_bs256.yaml --unarchive /opt/ml/input/data/coco/"
ERROR: ld.so: object '/libchangehostname.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/libchan