-
Notifications
You must be signed in to change notification settings - Fork 79
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Updating scripts to work better on DLAMI and train on 256 GPUs in 14 minutes (#5); updating ResNet TensorFlow scripts
- Loading branch information
Showing
9 changed files
with
450 additions
and
127 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Launch distributed ImageNet/ResNet training with Horovod over MPI, then
# evaluate the saved checkpoints on the master node.
# Specify hosts in the file `hosts`; ensure that the number of slots is equal
# to the number of GPUs on that host.

# Use train_more_aug.sh when training with a large number of GPUs (128, 256,
# etc). That script uses more augmentations and layer wise adaptive rate
# control (LARC) to help with convergence at large batch sizes.

# This script has been tested on DLAMI v17 and above

if [ -z "$1" ]; then
  echo "Usage: $0 <num_gpus>"
  exit 1
else
  gpus=$1
fi

# runclust <hostfile> <command> [verbose]
# Run <command> over ssh on every host listed in <hostfile>. Each line may be
# of the form "hostname slots=N"; the " slots..." suffix is stripped. Pass
# "verbose" as the third argument to print each host name before running.
# fd 10 is used for the host list so the command's own stdin stays untouched.
runclust() {
  while read -u 10 host; do
    host=${host%% slots*}
    if [ "$3" == "verbose" ]; then echo "On $host"; fi
    # StrictHostKeyChecking disabled so first-time connections do not prompt.
    ssh -o "StrictHostKeyChecking no" "$host" "$2"
  done 10<"$1"
}

# Activating tensorflow_p36 on each machine (done inside tmux because the
# first activation on a DLAMI can take several minutes).
runclust hosts "echo 'Activating tensorflow_p36'; tmux new-session -s activation_tf -d \"source activate tensorflow_p36 > activation_log.txt;\"" verbose
# Waiting for activation to finish
runclust hosts "while tmux has-session -t activation_tf 2>/dev/null; do :; done; cat activation_log.txt"
# You can comment out the above two runclust commands if you have activated the environment on all machines at least once

# Activate locally for the mpirun command to use
source activate tensorflow_p36

echo "Launching training job using $gpus GPUs"
set -ex

# use ens3 interface for DLAMI Ubuntu and eth0 interface for DLAMI AmazonLinux
if [ -n "$(uname -a | grep Ubuntu)" ]; then INTERFACE=ens3; else INTERFACE=eth0; fi
NUM_GPUS_MASTER=$(nvidia-smi -L | wc -l)

# p3 instances have larger GPU memory, so a higher batch size can be used
GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader -i 0 | awk '{print $1}')
if [ "$GPU_MEM" -gt 15000 ]; then BATCH_SIZE=256; else BATCH_SIZE=128; fi

# Training
~/anaconda3/envs/tensorflow_p36/bin/mpirun -np "$gpus" -hostfile hosts -mca plm_rsh_no_tree_spawn 1 \
  -bind-to socket -map-by slot \
  -x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
  -x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
  -x NCCL_SOCKET_IFNAME="$INTERFACE" -mca btl_tcp_if_exclude lo,docker0 \
  -x TF_CPP_MIN_LOG_LEVEL=0 \
  python -W ignore train_imagenet_resnet_hvd.py \
  --data_dir ~/data/tf-imagenet/ --num_epochs 90 -b "$BATCH_SIZE" \
  --lr_decay_mode poly --warmup_epochs 10 --clear_log

# Evaluation
# Using only master node for evaluation as we saved checkpoints only on master node
# pass num_gpus it was trained on to print the epoch numbers correctly
~/anaconda3/envs/tensorflow_p36/bin/mpirun -np "$NUM_GPUS_MASTER" -mca plm_rsh_no_tree_spawn 1 \
  -bind-to socket -map-by slot \
  -x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
  -x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
  -x NCCL_SOCKET_IFNAME="$INTERFACE" -mca btl_tcp_if_exclude lo,docker0 \
  -x TF_CPP_MIN_LOG_LEVEL=0 \
  python -W ignore train_imagenet_resnet_hvd.py \
  --data_dir ~/data/tf-imagenet/ --num_epochs 90 -b "$BATCH_SIZE" \
  --eval --num_gpus "$gpus"
This file was deleted.
Oops, something went wrong.
20 changes: 0 additions & 20 deletions
20
models/resnet/tensorflow/dlami_scripts/train_8gpus_synthetic.sh
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Launch distributed ImageNet/ResNet training with Horovod over MPI at large
# scale (large batch sizes), then evaluate checkpoints on the master node.
# Specify hosts in the file `hosts`; ensure that the number of slots is equal
# to the number of GPUs on that host.

# Use this script when training with a large number of GPUs (128, 256, etc).
# It uses more augmentations than train.sh, and also uses layer wise adaptive
# rate control (LARC) to help with convergence at large batch sizes.

# This script has been tested on DLAMI v17 and above

if [ -z "$1" ]; then
  echo "Usage: $0 <num_gpus>"
  exit 1
else
  gpus=$1
fi

# runclust <hostfile> <command> [verbose]
# Run <command> over ssh on every host listed in <hostfile>. Each line may be
# of the form "hostname slots=N"; the " slots..." suffix is stripped. Pass
# "verbose" as the third argument to print each host name before running.
# fd 10 is used for the host list so the command's own stdin stays untouched.
runclust() {
  while read -u 10 host; do
    host=${host%% slots*}
    if [ "$3" == "verbose" ]; then echo "On $host"; fi
    # StrictHostKeyChecking disabled so first-time connections do not prompt.
    ssh -o "StrictHostKeyChecking no" "$host" "$2"
  done 10<"$1"
}

# Activating tensorflow_p36 on each machine (done inside tmux because the
# first activation on a DLAMI can take several minutes).
runclust hosts "echo 'Activating tensorflow_p36'; tmux new-session -s activation_tf -d \"source activate tensorflow_p36 > activation_log.txt;\"" verbose
# Waiting for activation to finish
runclust hosts "while tmux has-session -t activation_tf 2>/dev/null; do :; done; cat activation_log.txt"
# You can comment out the above two runclust commands if you have activated the environment on all machines at least once

# Activate locally for the mpirun command to use
source activate tensorflow_p36

echo "Launching training job using $gpus GPUs"
set -ex

# use ens3 interface for DLAMI Ubuntu and eth0 interface for DLAMI AmazonLinux
if [ -n "$(uname -a | grep Ubuntu)" ]; then INTERFACE=ens3; else INTERFACE=eth0; fi
NUM_GPUS_MASTER=$(nvidia-smi -L | wc -l)

# p3 instances have larger GPU memory, so a higher batch size can be used
GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader -i 0 | awk '{print $1}')
if [ "$GPU_MEM" -gt 15000 ]; then BATCH_SIZE=256; else BATCH_SIZE=128; fi

# Training
# This script is for training with large number of GPUs (large batch sizes).
# You can for instance just replace the number of GPUs to 128 with the same script.
~/anaconda3/envs/tensorflow_p36/bin/mpirun -np "$gpus" -hostfile hosts -mca plm_rsh_no_tree_spawn 1 \
  -bind-to socket -map-by slot \
  -x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
  -x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
  -x NCCL_SOCKET_IFNAME="$INTERFACE" -mca btl_tcp_if_exclude lo,docker0 \
  -x TF_CPP_MIN_LOG_LEVEL=0 \
  python -W ignore train_imagenet_resnet_hvd.py \
  --data_dir ~/data/tf-imagenet/ --num_epochs 90 --increased_aug -b "$BATCH_SIZE" \
  --mom 0.977 --wdecay 0.0005 --loss_scale 256. --use_larc \
  --lr_decay_mode linear_cosine --warmup_epochs 5 --clear_log

# Evaluation
# Using only gpus on master node for evaluation as we saved checkpoints only on master node
# pass num_gpus it was trained on to print the epoch numbers correctly
~/anaconda3/envs/tensorflow_p36/bin/mpirun -np "$NUM_GPUS_MASTER" -mca plm_rsh_no_tree_spawn 1 \
  -bind-to socket -map-by slot \
  -x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
  -x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
  -x NCCL_SOCKET_IFNAME="$INTERFACE" -mca btl_tcp_if_exclude lo,docker0 \
  -x TF_CPP_MIN_LOG_LEVEL=0 \
  python -W ignore train_imagenet_resnet_hvd.py \
  --data_dir ~/data/tf-imagenet/ --num_epochs 90 -b "$BATCH_SIZE" \
  --eval --num_gpus "$gpus"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Launch a short distributed training run on SYNTHETIC data (no dataset
# required) — useful for benchmarking throughput and validating the cluster.
# Specify hosts in the file `hosts`; ensure that the number of slots is equal
# to the number of GPUs on that host.

# This script has been tested on DLAMI v17 and above

if [ -z "$1" ]; then
  echo "Usage: $0 <num_gpus>"
  exit 1
else
  gpus=$1
fi

# runclust <hostfile> <command> [verbose]
# Run <command> over ssh on every host listed in <hostfile>. Each line may be
# of the form "hostname slots=N"; the " slots..." suffix is stripped. Pass
# "verbose" as the third argument to print each host name before running.
# fd 10 is used for the host list so the command's own stdin stays untouched.
runclust() {
  while read -u 10 host; do
    host=${host%% slots*}
    if [ "$3" == "verbose" ]; then echo "On $host"; fi
    # StrictHostKeyChecking disabled so first-time connections do not prompt.
    ssh -o "StrictHostKeyChecking no" "$host" "$2"
  done 10<"$1"
}

# Activating tensorflow_p36 on each machine (done inside tmux because the
# first activation on a DLAMI can take several minutes).
runclust hosts "echo 'Activating tensorflow_p36'; tmux new-session -s activation_tf -d \"source activate tensorflow_p36 > activation_log.txt;\"" verbose
# Waiting for activation to finish
runclust hosts "while tmux has-session -t activation_tf 2>/dev/null; do :; done; cat activation_log.txt"
# You can comment out the above two runclust commands if you have activated the environment on all machines at least once

# Activate locally for the mpirun command to use
source activate tensorflow_p36

echo "Launching training job with synthetic data using $gpus GPUs"
set -ex

# use ens3 interface for DLAMI Ubuntu and eth0 interface for DLAMI AmazonLinux
if [ -n "$(uname -a | grep Ubuntu)" ]; then INTERFACE=ens3; else INTERFACE=eth0; fi
# LARC and loss scaling are only needed at very large batch sizes (>=128 GPUs)
if [ "$gpus" -ge 128 ]; then LARC_AND_SCALING=" --use_larc --loss_scale 256."; else LARC_AND_SCALING=""; fi

# p3 instances have larger GPU memory, so a higher batch size can be used
GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader -i 0 | awk '{print $1}')
if [ "$GPU_MEM" -gt 15000 ]; then BATCH_SIZE=256; else BATCH_SIZE=128; fi

# Training
# NOTE: $LARC_AND_SCALING is deliberately left unquoted so it word-splits into
# separate command-line flags (it is empty or " --use_larc --loss_scale 256.").
~/anaconda3/envs/tensorflow_p36/bin/mpirun -np "$gpus" -hostfile hosts -mca plm_rsh_no_tree_spawn 1 \
  -bind-to socket -map-by slot \
  -x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
  -x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
  -x NCCL_SOCKET_IFNAME="$INTERFACE" -mca btl_tcp_if_exclude lo,docker0 \
  -x TF_CPP_MIN_LOG_LEVEL=0 \
  python -W ignore train_imagenet_resnet_hvd.py \
  --synthetic -b "$BATCH_SIZE" --num_epochs 5 --clear_log $LARC_AND_SCALING
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Launch distributed ImageNet/ResNet training with Horovod over MPI, then
# evaluate the saved checkpoints on the master node.
# Ensure you have Horovod, OpenMPI, and Tensorflow installed on each machine.
# Specify hosts in the file `hosts`.

if [ -z "$1" ]; then
  echo "Usage: $0 <num_gpus>"
  exit 1
else
  gpus=$1
fi

echo "Launching training job using $gpus GPUs"
set -ex

# p3 instances have larger GPU memory, so a higher batch size can be used
GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader -i 0 | awk '{print $1}')
if [ "$GPU_MEM" -gt 15000 ]; then BATCH_SIZE=256; else BATCH_SIZE=128; fi

# Training
mpirun -np "$gpus" -hostfile hosts -mca plm_rsh_no_tree_spawn 1 \
  -bind-to socket -map-by slot \
  -x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
  -x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
  -x TF_CPP_MIN_LOG_LEVEL=0 \
  python -W ignore train_imagenet_resnet_hvd.py \
  --data_dir ~/data/tf-imagenet/ --num_epochs 90 -b "$BATCH_SIZE" \
  --lr_decay_mode poly --warmup_epochs 10 --clear_log

# Number of GPUs visible on this (master) node, used for evaluation below.
NUM_GPUS_MASTER=$(nvidia-smi -L | wc -l)

# Evaluation
# Using only the gpus on master node for evaluation as we saved checkpoints only on master node
# pass num_gpus it was trained on to print the epoch numbers correctly
mpirun -np "$NUM_GPUS_MASTER" -mca plm_rsh_no_tree_spawn 1 \
  -bind-to socket -map-by slot \
  -x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
  -x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
  -x TF_CPP_MIN_LOG_LEVEL=0 \
  python -W ignore train_imagenet_resnet_hvd.py \
  --data_dir ~/data/tf-imagenet/ --num_epochs 90 \
  --eval --num_gpus "$gpus"
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.