Updating scripts to work better on DLAMI and train on 256 GPUs in 14 minutes (#5)

Updating ResNet Tensorflow scripts
rahul003 committed Jan 4, 2019
1 parent 0c297fd commit 599adf2
Showing 9 changed files with 450 additions and 127 deletions.
62 changes: 62 additions & 0 deletions models/resnet/tensorflow/dlami_scripts/train.sh
@@ -0,0 +1,62 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Specify hosts in the file `hosts`; ensure that the number of slots for each host equals the number of GPUs on that host
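# As an illustration (hypothetical hostnames), a `hosts` file for two p3.16xlarge instances with 8 GPUs each
# would contain:
#   ip-172-31-0-1 slots=8
#   ip-172-31-0-2 slots=8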

# Use train_more_aug.sh when training with a large number of GPUs (128, 256, etc.). That script uses more augmentation and layer-wise adaptive rate control (LARC) to help with convergence at large batch sizes.

# This script has been tested on DLAMI v17 and above

if [ -z "$1" ]
then
echo "Usage: "$0" <num_gpus>"
exit 1
else
gpus=$1
fi

function runclust(){ while read -u 10 host; do host=${host%% slots*}; if [ "$3" == "verbose" ]; then echo "On $host"; fi; ssh -o "StrictHostKeyChecking no" "$host" "$2"; done 10<"$1"; }
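# Illustrative usage (not part of the original script): `runclust hosts "nvidia-smi -L" verbose`
# runs the given command over ssh on every host listed in the `hosts` file.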

# Activating tensorflow_p36 on each machine
runclust hosts "echo 'Activating tensorflow_p36'; tmux new-session -s activation_tf -d \"source activate tensorflow_p36 > activation_log.txt;\"" verbose;
# Waiting for activation to finish
runclust hosts "while tmux has-session -t activation_tf 2>/dev/null; do :; done; cat activation_log.txt"
# You can comment out the above two runclust commands if you have activated the environment on all machines at least once

# Activate locally for the mpirun command to use
source activate tensorflow_p36

echo "Launching training job using $gpus GPUs"
set -ex

# Use the ens3 interface on the Ubuntu DLAMI and the eth0 interface on the Amazon Linux DLAMI
if [ -n "$(uname -a | grep Ubuntu)" ]; then INTERFACE=ens3 ; else INTERFACE=eth0; fi
NUM_GPUS_MASTER=`nvidia-smi -L | wc -l`

# p3 instances have larger GPU memory, so a higher batch size can be used
GPU_MEM=`nvidia-smi --query-gpu=memory.total --format=csv,noheader -i 0 | awk '{print $1}'`
if [ $GPU_MEM -gt 15000 ] ; then BATCH_SIZE=256; else BATCH_SIZE=128; fi
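# For reference: the 16 GB V100s on p3 instances report roughly 16,000 MiB, so BATCH_SIZE becomes 256,
# while GPUs with ~12 GB or less report well under 15,000 MiB and keep BATCH_SIZE at 128.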

# Training
~/anaconda3/envs/tensorflow_p36/bin/mpirun -np $gpus -hostfile hosts -mca plm_rsh_no_tree_spawn 1 \
-bind-to socket -map-by slot \
-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
-x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
-x NCCL_SOCKET_IFNAME=$INTERFACE -mca btl_tcp_if_exclude lo,docker0 \
-x TF_CPP_MIN_LOG_LEVEL=0 \
python -W ignore train_imagenet_resnet_hvd.py \
--data_dir ~/data/tf-imagenet/ --num_epochs 90 -b $BATCH_SIZE \
--lr_decay_mode poly --warmup_epochs 10 --clear_log
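# Note: -b appears to be the per-GPU batch size (typical for Horovod data parallelism), so the effective
# global batch size is gpus * BATCH_SIZE, e.g. 256 GPUs * 256 per GPU = 65,536 images per step.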

# Evaluation
# Evaluate on the master node only, since checkpoints were saved only on the master node
# Pass the number of GPUs used for training so that epoch numbers are printed correctly
~/anaconda3/envs/tensorflow_p36/bin/mpirun -np $NUM_GPUS_MASTER -mca plm_rsh_no_tree_spawn 1 \
-bind-to socket -map-by slot \
-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
-x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
-x NCCL_SOCKET_IFNAME=$INTERFACE -mca btl_tcp_if_exclude lo,docker0 \
-x TF_CPP_MIN_LOG_LEVEL=0 \
python -W ignore train_imagenet_resnet_hvd.py \
--data_dir ~/data/tf-imagenet/ --num_epochs 90 -b $BATCH_SIZE \
--eval --num_gpus $gpus
34 changes: 0 additions & 34 deletions models/resnet/tensorflow/dlami_scripts/train_64gpus.sh

This file was deleted.

20 changes: 0 additions & 20 deletions models/resnet/tensorflow/dlami_scripts/train_8gpus_synthetic.sh

This file was deleted.

65 changes: 65 additions & 0 deletions models/resnet/tensorflow/dlami_scripts/train_more_aug.sh
@@ -0,0 +1,65 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Specify hosts in the file `hosts`; ensure that the number of slots for each host equals the number of GPUs on that host

# Use this script when training with a large number of GPUs (128, 256, etc.). It uses more augmentation than train.sh, and also layer-wise adaptive rate control (LARC), to help with convergence at large batch sizes.

# This script has been tested on DLAMI v17 and above

if [ -z "$1" ]
then
echo "Usage: "$0" <num_gpus>"
exit 1
else
gpus=$1
fi

function runclust(){ while read -u 10 host; do host=${host%% slots*}; if [ "$3" == "verbose" ]; then echo "On $host"; fi; ssh -o "StrictHostKeyChecking no" "$host" "$2"; done 10<"$1"; }

# Activating tensorflow_p36 on each machine
runclust hosts "echo 'Activating tensorflow_p36'; tmux new-session -s activation_tf -d \"source activate tensorflow_p36 > activation_log.txt;\"" verbose;
# Waiting for activation to finish
runclust hosts "while tmux has-session -t activation_tf 2>/dev/null; do :; done; cat activation_log.txt"
# You can comment out the above two runclust commands if you have activated the environment on all machines at least once

# Activate locally for the mpirun command to use
source activate tensorflow_p36

echo "Launching training job using $gpus GPUs"
set -ex

# Use the ens3 interface on the Ubuntu DLAMI and the eth0 interface on the Amazon Linux DLAMI
if [ -n "$(uname -a | grep Ubuntu)" ]; then INTERFACE=ens3 ; else INTERFACE=eth0; fi
NUM_GPUS_MASTER=`nvidia-smi -L | wc -l`

# p3 instances have larger GPU memory, so a higher batch size can be used
GPU_MEM=`nvidia-smi --query-gpu=memory.total --format=csv,noheader -i 0 | awk '{print $1}'`
if [ $GPU_MEM -gt 15000 ] ; then BATCH_SIZE=256; else BATCH_SIZE=128; fi

# Training
# This script is intended for training with a large number of GPUs (i.e., large global batch sizes).
# For example, you can run the same script with 128 GPUs by changing only the GPU count.
~/anaconda3/envs/tensorflow_p36/bin/mpirun -np $gpus -hostfile hosts -mca plm_rsh_no_tree_spawn 1 \
-bind-to socket -map-by slot \
-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
-x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
-x NCCL_SOCKET_IFNAME=$INTERFACE -mca btl_tcp_if_exclude lo,docker0 \
-x TF_CPP_MIN_LOG_LEVEL=0 \
python -W ignore train_imagenet_resnet_hvd.py \
--data_dir ~/data/tf-imagenet/ --num_epochs 90 --increased_aug -b $BATCH_SIZE \
--mom 0.977 --wdecay 0.0005 --loss_scale 256. --use_larc \
--lr_decay_mode linear_cosine --warmup_epochs 5 --clear_log

# Evaluation
# Evaluate using only the GPUs on the master node, since checkpoints were saved only there
# Pass the number of GPUs used for training so that epoch numbers are printed correctly
~/anaconda3/envs/tensorflow_p36/bin/mpirun -np $NUM_GPUS_MASTER -mca plm_rsh_no_tree_spawn 1 \
-bind-to socket -map-by slot \
-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
-x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
-x NCCL_SOCKET_IFNAME=$INTERFACE -mca btl_tcp_if_exclude lo,docker0 \
-x TF_CPP_MIN_LOG_LEVEL=0 \
python -W ignore train_imagenet_resnet_hvd.py \
--data_dir ~/data/tf-imagenet/ --num_epochs 90 -b $BATCH_SIZE \
--eval --num_gpus $gpus
46 changes: 46 additions & 0 deletions models/resnet/tensorflow/dlami_scripts/train_synthetic.sh
@@ -0,0 +1,46 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Specify hosts in the file `hosts`; ensure that the number of slots for each host equals the number of GPUs on that host

# This script has been tested on DLAMI v17 and above

if [ -z "$1" ]
then
echo "Usage: "$0" <num_gpus>"
exit 1
else
gpus=$1
fi

function runclust(){ while read -u 10 host; do host=${host%% slots*}; if [ "$3" == "verbose" ]; then echo "On $host"; fi; ssh -o "StrictHostKeyChecking no" "$host" "$2"; done 10<"$1"; }

# Activating tensorflow_p36 on each machine
runclust hosts "echo 'Activating tensorflow_p36'; tmux new-session -s activation_tf -d \"source activate tensorflow_p36 > activation_log.txt;\"" verbose;
# Waiting for activation to finish
runclust hosts "while tmux has-session -t activation_tf 2>/dev/null; do :; done; cat activation_log.txt"
# You can comment out the above two runclust commands if you have activated the environment on all machines at least once

# Activate locally for the mpirun command to use
source activate tensorflow_p36

echo "Launching training job with synthetic data using $gpus GPUs"
set -ex

# Use the ens3 interface on the Ubuntu DLAMI and the eth0 interface on the Amazon Linux DLAMI
if [ -n "$(uname -a | grep Ubuntu)" ]; then INTERFACE=ens3 ; else INTERFACE=eth0; fi
if [ "$gpus" -ge 128 ]; then LARC_AND_SCALING=" --use_larc --loss_scale 256." ; else LARC_AND_SCALING=""; fi

# p3 instances have larger GPU memory, so a higher batch size can be used
GPU_MEM=`nvidia-smi --query-gpu=memory.total --format=csv,noheader -i 0 | awk '{print $1}'`
if [ $GPU_MEM -gt 15000 ] ; then BATCH_SIZE=256; else BATCH_SIZE=128; fi

# Training
~/anaconda3/envs/tensorflow_p36/bin/mpirun -np $gpus -hostfile hosts -mca plm_rsh_no_tree_spawn 1 \
-bind-to socket -map-by slot \
-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
-x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
-x NCCL_SOCKET_IFNAME=$INTERFACE -mca btl_tcp_if_exclude lo,docker0 \
-x TF_CPP_MIN_LOG_LEVEL=0 \
python -W ignore train_imagenet_resnet_hvd.py \
--synthetic -b $BATCH_SIZE --num_epochs 5 --clear_log $LARC_AND_SCALING
44 changes: 44 additions & 0 deletions models/resnet/tensorflow/train.sh
@@ -0,0 +1,44 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Ensure you have Horovod, OpenMPI, and Tensorflow installed on each machine
# Specify hosts in the file `hosts`
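# Optional sanity check (illustrative, assuming a standard Horovod + Open MPI install), run on each machine:
#   mpirun --version
#   python -c "import horovod.tensorflow, horovod; print(horovod.__version__)"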

if [ -z "$1" ]
then
echo "Usage: "$0" <num_gpus>"
exit 1
else
gpus=$1
fi

echo "Launching training job using $gpus GPUs"
set -ex

# p3 instances have larger GPU memory, so a higher batch size can be used
GPU_MEM=`nvidia-smi --query-gpu=memory.total --format=csv,noheader -i 0 | awk '{print $1}'`
if [ $GPU_MEM -gt 15000 ] ; then BATCH_SIZE=256; else BATCH_SIZE=128; fi

# Training
mpirun -np $gpus -hostfile hosts -mca plm_rsh_no_tree_spawn 1 \
-bind-to socket -map-by slot \
-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
-x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
-x TF_CPP_MIN_LOG_LEVEL=0 \
python -W ignore train_imagenet_resnet_hvd.py \
--data_dir ~/data/tf-imagenet/ --num_epochs 90 -b $BATCH_SIZE \
--lr_decay_mode poly --warmup_epochs 10 --clear_log

NUM_GPUS_MASTER=`nvidia-smi -L | wc -l`

# Evaluation
# Evaluate using only the GPUs on the master node, since checkpoints were saved only there
# Pass the number of GPUs used for training so that epoch numbers are printed correctly
mpirun -np $NUM_GPUS_MASTER -mca plm_rsh_no_tree_spawn 1 \
-bind-to socket -map-by slot \
-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
-x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
-x TF_CPP_MIN_LOG_LEVEL=0 \
python -W ignore train_imagenet_resnet_hvd.py \
--data_dir ~/data/tf-imagenet/ --num_epochs 90 \
--eval --num_gpus $gpus
13 changes: 0 additions & 13 deletions models/resnet/tensorflow/train_64gpus.sh

This file was deleted.

