## Nvidia MLPerf on SageMaker

In [None]:
import os
from datetime import datetime
import boto3
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role

In [None]:
# One boto3 session is shared by everything in this notebook so the region and
# the SageMaker client are guaranteed to come from the same configuration.
boto_sess = boto3.Session()
region = boto_sess.region_name
sm = boto_sess.client('sagemaker')

s3_bucket = "s3://jbsnyder-sagemaker-us-east/"

base_job_name = "jbsnyder-mlperf-mrcnn"

# Take a single timestamp and derive both strings from it. Calling
# datetime.now() separately for each could straddle midnight and put the job
# under a date folder that does not match the date embedded in its name.
launch_time = datetime.now()
date_str = launch_time.strftime("%d-%m-%Y")
time_str = launch_time.strftime("%d-%m-%Y-%H-%M-%S")
job_name = f"{base_job_name}-{time_str}"

# S3 prefixes for this run: <bucket>/<kind>/<date>/<job name>
output_path = os.path.join(s3_bucket, "sagemaker-output", date_str, job_name)
code_location = os.path.join(s3_bucket, "sagemaker-code", date_str, job_name)

In [None]:
instance_type = 'ml.p4d.24xlarge'
instance_count = 1

# ECR coordinates of the training image.
repo = "jbsnyder"
tag = "pytorch-mrcnn"
image_region = "us-east-1"  # region of the ECR registry that hosts the image

# Resolve the AWS account ID at run time instead of hard-coding it in the URI.
# NOTE(review): this assumes the image lives in the caller's own account --
# confirm, and substitute a fixed account ID if the registry is shared.
account = boto3.client("sts", region_name=image_region).get_caller_identity()["Account"]
image_uri = f"{account}.dkr.ecr.{image_region}.amazonaws.com/{repo}:{tag}"

In [None]:
# Distributed backend for the job. "smddp" is the other strategy this notebook
# has branches for, but only NCCL is enabled at the moment, so anything else
# is rejected up front.
dist_strategy = "nccl"
assert dist_strategy == "nccl"

# The training config is chosen by node count, so it follows instance_count.
hyperparameters = {}
hyperparameters["config-file"] = (
    f"configs/e2e_mask_rcnn_R_50_FPN_1x_{instance_count}_node.yaml"
)

In [None]:
# Pick the launcher script and the SageMaker `distribution` setting that go
# with the chosen strategy.
if dist_strategy != "nccl":
    # Any non-NCCL strategy: turn on SageMaker distributed data parallel and
    # run the training script directly.
    entry_point = "aws_train_mlperf.py"
    distribution = {
        "smdistributed": {
            "dataparallel": {
                "enabled": True,
            },
        },
    }
else:
    # NCCL: no SageMaker-managed distribution; launch_ddp.py is the entry point.
    entry_point = "launch_ddp.py"
    distribution = None

In [None]:
# Input channels for the training job: channel name -> S3 prefix.
# Every prefix lives under <bucket>/data/ and keeps its trailing slash.
channels = {
    "all_data": os.path.join(os.path.join(s3_bucket, "data"), "yolo/"),
    "annotations": os.path.join(os.path.join(s3_bucket, "data"), "coco", "annotations/"),
    "weights": os.path.join(os.path.join(s3_bucket, "data"), "weights", "pt-resnet/"),
}

In [None]:
# Describe the training job. The job itself is launched by the later
# estimator.fit(...) call.
estimator = PyTorch(
    # --- what to run ---
    image_uri=image_uri,
    source_dir="./src",
    entry_point=entry_point,
    hyperparameters=hyperparameters,
    distribution=distribution,
    # --- where to run it ---
    role=get_execution_role(),
    instance_type=instance_type,
    instance_count=instance_count,
    # volume_size is deliberately left unset: not necessary for P4d.
    max_run=7200,
    input_mode="File",
    # --- naming and S3 locations for this run ---
    base_job_name=job_name,
    code_location=code_location,
    output_path=os.path.join(output_path, "training-output"),
    checkpoint_s3_uri=os.path.join(output_path, "training-checkpoints"),
    model_dir=os.path.join(output_path, "training-model"),
)

In [None]:
# Submit the job under the explicit, timestamped job_name. wait=False is passed
# so this cell does not wait on the job.
estimator.fit(inputs=channels, job_name=job_name, wait=False)

In [None]:
# Show the logs of the training job started above.
# NOTE(review): per the SageMaker SDK this call appears to block until the job
# finishes -- confirm against the installed SDK version.
estimator.logs()