## Horovod Distributed Training with Script Mode

### Setup 

In [49]:
import sagemaker
import os
from sagemaker.utils import sagemaker_timestamp
from sagemaker.tensorflow import TensorFlow

from sagemaker import get_execution_role

# SageMaker session shared by the later cells, plus the IAM role
# returned by get_execution_role().
sage_session = sagemaker.Session()
role = get_execution_role()

# Region and AWS account id both come from the session's boto session;
# the account id is read from the STS caller identity.
region = sage_session.boto_session.region_name
account = (
    sage_session.boto_session
    .client('sts')
    .get_caller_identity()['Account']
)

# Name of the locally built image and the ECR URL (always ":latest")
# derived from the account, region and image name above.
image_name = "sagemaker-horovod-distributed-training-3"
ecr_image_url = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(
    account, region, image_name)


### Build your Horovod container

In [50]:
%%script env region=$region image_name=$image_name bash

echo "Building docker image locally with image name: $image_name region: $region"

# Account id of the SageMaker-owned ECR registry that hosts the base
# TensorFlow image this container extends.
sagemaker_registry_account=520713654638
sagemaker_registry="${sagemaker_registry_account}.dkr.ecr.${region}.amazonaws.com"

# Log docker in to the SageMaker registry so the base image can be pulled.
# `aws ecr get-login` is deprecated and not available in AWS CLI v2; piping
# `get-login-password` into `docker login --password-stdin` is the supported
# replacement and keeps the token off the command line.
aws ecr get-login-password --region "${region}" \
  | docker login --username AWS --password-stdin "${sagemaker_registry}"

# On a SageMaker Notebook Instance, the docker daemon may need to be restarted in order
# to detect your network configuration correctly.  (This is a known issue.)
if [ -d "/home/ec2-user/SageMaker" ]; then
  sudo service docker restart
fi

# Build the docker image locally and tag it ${image_name}:latest.
# This cell only builds; pushing to ECR happens in the next section.
cd ../ && docker build -t "${image_name}:latest" --build-arg region="${region}" -f docker/Dockerfile.cpu .
    

Building docker image locally with image name: sagemaker-horovod-distributed-training-3 region: us-west-2
Login Succeeded
Sending build context to Docker daemon  648.1MB
Step 1/22 : ARG region
Step 2/22 : FROM 520713654638.dkr.ecr.$region.amazonaws.com/sagemaker-tensorflow-scriptmode:1.11.0-cpu-py3
 ---> 17315c52418a
Step 3/22 : RUN echo $region
 ---> Using cache
 ---> fe5d9732736e
Step 4/22 : RUN buildDeps="         wget         build-essential     "     && apt-get update && apt-get install -y --no-install-recommends $buildDeps     && apt-get clean     && rm -rf /var/lib/apt/lists/*
 ---> Using cache
 ---> f0582b469428
Step 5/22 : RUN mkdir /tmp/openmpi &&     cd /tmp/openmpi &&     wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz &&     tar zxf openmpi-3.0.0.tar.gz &&     cd openmpi-3.0.0 &&     ./configure --enable-orterun-prefix-by-default &&     make -j $(nproc) all &&     make install &&     ldconfig &&     rm -rf /tmp/openmpi
 ---> Using cache
 -



### Push container to ECR Repository

In [51]:
%%script env account=$account region=$region image_name=$image_name ecr_image_url=$ecr_image_url bash

echo "Pushing locally built container to ECR Repository: $ecr_image_url in region: $region on account: $account"

# If the repository doesn't exist in ECR, create it.
# Every ECR call is pinned to ${region} so the repository is looked up and
# created in the same region the login and push below use, instead of in
# whatever default region the AWS CLI happens to be configured with.
if ! aws ecr describe-repositories --region "${region}" --repository-names "${image_name}" > /dev/null 2>&1
then
    echo "Creating a new ECR repository with name: $image_name"
    aws ecr create-repository --region "${region}" --repository-name "${image_name}" > /dev/null
fi

# Authenticate docker against this account's registry.
# `aws ecr get-login` is deprecated and not available in AWS CLI v2; piping
# `get-login-password` into `docker login --password-stdin` is the supported
# replacement and keeps the token off the command line.
aws ecr get-login-password --region "${region}" \
  | docker login --username AWS --password-stdin "${account}.dkr.ecr.${region}.amazonaws.com"

# Tag Docker image with ECR Url
docker tag "${image_name}:latest" "${ecr_image_url}"

docker push "${ecr_image_url}"

Pushing locally built container to ECR Repository: 964029418868.dkr.ecr.us-west-2.amazonaws.com/sagemaker-horovod-distributed-training-3:latest in region: us-west-2 on account: 964029418868
Login Succeeded
The push refers to repository [964029418868.dkr.ecr.us-west-2.amazonaws.com/sagemaker-horovod-distributed-training-3]
f6eb9e5c770f: Preparing
3680749cb7a0: Preparing
aeae555763dc: Preparing
2b8e8a00654a: Preparing
5bbd8e9cabab: Preparing
4ec6682fc14e: Preparing
8d48a09d9821: Preparing
9552ffea3be9: Preparing
2accf753bc6e: Preparing
55151b854b91: Preparing
fd4655c217ad: Preparing
703f7275ec5e: Preparing
159af601bde7: Preparing
9d82ea0cc0c8: Preparing
a7024ca0e69b: Preparing
296294040c48: Preparing
302af95b9e68: Preparing
0ecacb4d5bb1: Preparing
d6db7d05e3b2: Preparing
8d999119430c: Preparing
fe4ed9a0e78a: Preparing
3db5746c911a: Preparing
819a824caf70: Preparing
647265b9d8bc: Preparing
41c002c8a6fd: Preparing
f6eb9e5c770f: Waiting
3680749cb7a0: Waiting
aeae555763dc: Waiting
2b8e8a0065



### Prepare train and test data

In [52]:

def _get_train_test_data(data_path, sagemaker_session):
    """Upload the local ``train`` and ``test`` folders under a timestamped prefix.

    Both uploads share the key prefix ``tf_mnist/<timestamp>``, with
    ``/train`` or ``/test`` appended, so each call writes to a fresh location.

    Args:
        data_path: Local directory containing ``train`` and ``test``
            sub-directories.
        sagemaker_session: Object whose ``upload_data(path=..., key_prefix=...)``
            method performs the upload.

    Returns:
        tuple: ``(test_input, train_input)`` -- the two ``upload_data`` results
        (presumably S3 URIs; confirm against the SDK). Note that the *test*
        value comes first.
    """
    s3_prefix = 'tf_mnist/{}'.format(sagemaker_timestamp())

    # The train split is uploaded first, then the test split.
    train_input = sagemaker_session.upload_data(
        path=os.path.join(data_path, 'train'),
        key_prefix=s3_prefix + '/train')
    test_input = sagemaker_session.upload_data(
        path=os.path.join(data_path, 'test'),
        key_prefix=s3_prefix + '/test')

    return test_input, train_input

# Training sources live in ../src relative to this notebook, with the
# dataset in its "data" sub-directory.
source_dir = '../src'
data_path = os.path.join(source_dir, 'data')

# The helper returns the test value first, then the train value.
test_input, train_input = _get_train_test_data(
    data_path=data_path,
    sagemaker_session=sage_session)

### Train it with Horovod

In [54]:

# Number of training instances; also embedded in the job name below.
instance_count = 12

estimator = TensorFlow(
    # Code to run: the launcher script inside source_dir.
    entry_point="horovod_launcher.py",
    source_dir=source_dir,
    # Identity, session and the custom image built and pushed above.
    role=role,
    sagemaker_session=sage_session,
    image_name=ecr_image_url,
    # Cluster shape.
    train_instance_count=instance_count,
    train_instance_type="ml.c4.xlarge",
    # Step counts passed through to the estimator.
    training_steps=1,
    evaluation_steps=1,
    base_job_name="tf-horovod-{}x".format(instance_count),
)

# Start the training job with the two uploaded channels.
estimator.fit({
    'train': train_input,
    'test': test_input,
})

INFO:sagemaker:Creating training-job with name: tf-horovod-12x-2018-11-25-11-19-08-954


2018-11-25 11:19:28 Starting - Starting the training job...
2018-11-25 11:19:31 Starting - Launching requested ML instances......
2018-11-25 11:20:39 Starting - Preparing the instances for training......
2018-11-25 11:21:51 Downloading - Downloading input data...
2018-11-25 11:22:06 Training - Downloading the training image..
[36m2018-11-25 11:22:40,335 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[36m2018-11-25 11:22:40,338 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[36m2018-11-25 11:22:40,753 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[36m2018-11-25 11:22:40,765 sagemaker-containers INFO     Invoking user script
[0m
[36mTraining Env:
[0m
[36m{
    "additional_framework_parameters": {
        "sagemaker_requirements": ""
    },
    "channel_input_dirs": {
        "test": "/opt/ml/input/data/test",
        "train": "/opt/ml/input/data/train"
    },
    "cu

[33m2018-11-25 11:22:41,480 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[33m2018-11-25 11:22:41,484 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[33m2018-11-25 11:22:41,825 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[33m2018-11-25 11:22:41,836 sagemaker-containers INFO     Invoking user script
[0m
[33mTraining Env:
[0m
[33m{
    "additional_framework_parameters": {
        "sagemaker_requirements": ""
    },
    "channel_input_dirs": {
        "test": "/opt/ml/input/data/test",
        "train": "/opt/ml/input/data/train"
    },
    "current_host": "algo-6",
    "framework_module": "sagemaker_tensorflow_container.training:main",
    "hosts": [
        "algo-1",
        "algo-2",
        "algo-3",
        "algo-4",
        "algo-5",
        "algo-6",
        "algo-7",
        "algo-8",
        "algo-9",
        "algo-10",
        "algo-11",
        "algo-12"


[31mhosts that aren't SSHable yet: %s ['algo-7', 'algo-8', 'algo-10', 'algo-11'][0m
[31m>>> Host: algo-7 is sshable now.[0m
[35m2018-11-25 11:22:45,035 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[35m2018-11-25 11:22:45,037 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[35m2018-11-25 11:22:45,413 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[35m2018-11-25 11:22:45,425 sagemaker-containers INFO     Invoking user script
[0m
[35mTraining Env:
[0m
[35m{
    "additional_framework_parameters": {
        "sagemaker_requirements": ""
    },
    "channel_input_dirs": {
        "test": "/opt/ml/input/data/test",
        "train": "/opt/ml/input/data/train"
    },
    "current_host": "algo-8",
    "framework_module": "sagemaker_tensorflow_container.training:main",
    "hosts": [
        "algo-1",
        "algo-2",
        "algo-3",
        "algo-4",
        "algo-5",


[31mEnv Hosts: ['algo-1', 'algo-2', 'algo-3', 'algo-4', 'algo-5', 'algo-6', 'algo-7', 'algo-8', 'algo-9', 'algo-10', 'algo-11', 'algo-12'] Hosts: ['algo-1', 'algo-2', 'algo-3', 'algo-4', 'algo-5', 'algo-6', 'algo-7', 'algo-8', 'algo-9', 'algo-10', 'algo-11', 'algo-12'] process_per_hosts: 1 num_processes: 12[0m
[31mnetwork interface name: %s ethwe[0m
[31mMPI Command: mpirun --host algo-1,algo-2,algo-3,algo-4,algo-5,algo-6,algo-7,algo-8,algo-9,algo-10,algo-11,algo-12 -np 12  --allow-run-as-root --display-map --tag-output -mca btl_tcp_if_include ethwe -mca oob_tcp_if_include ethwe -x NCCL_SOCKET_IFNAME=ethwe --mca plm_rsh_no_tree_spawn 1 -mca orte_abort_on_non_zero_status 1 -x NCCL_MIN_NRINGS=8 -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH -x LD_PRELOAD=/libchangehostname.so -x SM_HOSTS="["algo-1","algo-2","algo-3","algo-4","algo-5","algo-6","algo-7","algo-8","algo-9","algo-10","algo-11","algo-12"]" -x SM_NETWORK_INTERFACE_NAME="ethwe" -x SM_HPS="{"checkpoint_path":"s3://sagemaker-us


2018-11-25 11:22:46 Training - Training image download completed. Training in progress.

RuntimeError: generator raised StopIteration