## SageMakerCV

This notebook launches a SageMakerCV training job for PyTorch.

In [1]:
import os
import subprocess
from datetime import datetime

from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch

In [134]:
# Set the default region for the AWS calls made from this notebook.
region = "us-west-2"
os.environ['AWS_DEFAULT_REGION'] = region

# Take one timestamp and derive both strings from it. Calling datetime.now()
# twice can straddle midnight, which would put the job in one day's S3 folder
# while its name carries the next day's date.
launch_time = datetime.now()

# user_id: only used for naming your job, so it can be anything you like.
user_id = "jbsnyder"
# date_str: used for organizing your jobs in your S3 bucket.
date_str = launch_time.strftime("%d-%m-%Y")
# time_str: used for keeping track of job names.
time_str = launch_time.strftime("%d-%m-%Y-%H-%M-%S")
# ecr_repo: the ECR repo that contains your SageMakerCV Docker image.
# If you haven't created a SageMakerCV Docker image, see the instructions here.
ecr_repo = "jbsnyder"
# algo_name: the name of your Docker image in your ECR repo (it is placed after
# the ":" when the image URI is built).
algo_name = "smcv-pt-1.8"
# instance_type: the type of SageMaker instance you want to use for training.
instance_type = "ml.p4d.24xlarge"
# nodes: how many instances of instance_type the job runs on.
nodes = 4
# config_file: the model and training configuration, in yaml format.
config_file = "configs/mrcnn_bs384_O4.yaml"
# s3_bucket: the bucket that contains your data; it is also used for storing results.
s3_bucket = "s3://jbsnyder-sagemaker-pdx/"

# The account_call (defined further down) is a subprocess command to get the AWS
# account associated with your local AWS credentials. This is used to get the
# account associated with your ECR repo.

# AWS CLI command that prints the account ID tied to the active AWS credentials.
# That account ID is used to build the ECR image URI.
account_call = f"aws sts get-caller-identity --region {region} --endpoint-url https://sts.{region}.amazonaws.com --query Account --output text"

# Run the command as an argument list instead of through a shell, so nothing in
# the interpolated region value can be interpreted as shell syntax. The command
# contains no quoted arguments, so a plain whitespace split is sufficient.
ecr_account = subprocess.check_output(account_call.split()).decode().strip()

# Fail here, with a clear message, rather than later with a malformed image URI.
if not (ecr_account.isdigit() and len(ecr_account) == 12):
    raise ValueError(
        f"Expected a 12-digit AWS account ID from the AWS CLI, got {ecr_account!r}"
    )

# Image URI layout: <account>.dkr.ecr.<region>.amazonaws.com/<repo>:<image name>
docker_image = f"{ecr_account}.dkr.ecr.{region}.amazonaws.com/{ecr_repo}:{algo_name}"

# Execution role handed to the estimator below.
role = get_execution_role()

In [135]:
# Instance types that run 8 processes per host and, on multi-node jobs, use the
# SageMaker distributed data parallel library. Defined once so the two checks
# below cannot drift apart.
smddp_instance_types = ('ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge')
# Instance types that run 4 processes per host.
four_process_instance_types = ('ml.p3.8xlarge', 'ml.g4dn.12xlarge')

# Short instance label for the job name, e.g. "ml.p4d.24xlarge" -> "p4d-24x".
instance_type_short = '-'.join(instance_type.split('.')[1:]).replace('large', '')

if instance_type in smddp_instance_types:
    processes_per_host = 8
elif instance_type in four_process_instance_types:
    processes_per_host = 4
else:
    processes_per_host = 1

# Config label for the job name, taken from the file name itself so it works for
# any directory depth (the previous split('/')[1] raised IndexError for a bare
# file name and picked a directory for nested paths). Underscores become dashes.
config_info = os.path.splitext(os.path.basename(config_file))[0].replace('_', '-')

source_dir = "."

# Multi-node jobs on the supported instance types are launched through
# smdistributed dataparallel with train.py as the entry point; every other
# combination runs launch_torch.py with no distribution setting.
if nodes > 1 and instance_type in smddp_instance_types:
    distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
    main_script = "train.py"
else:
    distribution = None
    main_script = "launch_torch.py"

def _s3_join(base, *parts):
    """Join S3 URI components with '/' regardless of the host operating system.

    os.path.join uses the local path separator, which is wrong for URIs on
    non-POSIX hosts. Slashes where two components meet are collapsed to one;
    a trailing slash on the final component is kept.
    """
    uri = base.rstrip('/')
    for part in parts:
        uri = uri + '/' + part.lstrip('/')
    return uri


job_name = f'{user_id}-{config_info}-{instance_type_short}-{time_str}'

# SageMaker rejects training job names longer than 63 characters; fail here with
# a readable message instead of at submission time.
if len(job_name) > 63:
    raise ValueError(
        f"job_name is {len(job_name)} characters, above the 63-character limit: {job_name}"
    )

# Outputs and uploaded code are grouped by date, then by job name.
output_path = _s3_join(s3_bucket, "sagemaker-output", date_str, job_name)

code_location = _s3_join(s3_bucket, "sagemaker-code", date_str, job_name)

s3_data_dir = _s3_join(s3_bucket, "data")
s3_coco_dir = "coco/2017/archive/"
s3_weights_dir = "weights/pytorch/resnet/"

# Input channels passed to estimator.fit: channel name -> S3 prefix.
channels = {
    'coco': _s3_join(s3_data_dir, s3_coco_dir),
    'weights': _s3_join(s3_data_dir, s3_weights_dir),
}

# "config" is the yaml configuration file; "unarchive" is the path of the coco
# channel inside the training container.
hyperparameters = {"config": config_file,
                   "unarchive": '/opt/ml/input/data/coco/'}

In [136]:
# Configure the training job. Arguments are grouped by purpose; every value is
# computed in the cells above.
estimator = PyTorch(
    # What to run: entry script and source directory, inside the custom image.
    entry_point=main_script,
    source_dir=source_dir,
    image_uri=docker_image,
    role=role,
    # Cluster shape and per-instance volume size.
    instance_type=instance_type,
    instance_count=nodes,
    volume_size=500,
    distribution=distribution,
    # Settings handed to the entry script.
    hyperparameters=hyperparameters,
    # S3 locations: output, checkpoints and model_dir all share output_path.
    output_path=output_path,
    checkpoint_s3_uri=output_path,
    model_dir=output_path,
    code_location=code_location,
    # Profiler disabled and no debugger hook configuration.
    disable_profiler=True,
    debugger_hook_config=False,
)

In [137]:
# Submit the job under the generated name, with the S3 channels as inputs.
# wait=False is passed so this cell is not expected to block while training
# runs; follow the job's progress from the SageMaker console.
estimator.fit(inputs=channels, job_name=job_name, wait=False)