Skip to content

Commit

Permalink
change: add buildspec-release file and upgrade cuda version (#109)
Browse files Browse the repository at this point in the history
  • Loading branch information
chuyang-deng committed Jun 13, 2019
1 parent 5b4466c commit cb9f417
Show file tree
Hide file tree
Showing 9 changed files with 195 additions and 30 deletions.
1 change: 1 addition & 0 deletions VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1.2.0
155 changes: 155 additions & 0 deletions buildspec-release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
version: 0.2

env:
  variables:
    # NOTE(review): VERSION and `git-release --min-version` below say 1.2.0 while
    # this is 1.1.0 — confirm the mismatch is intentional before releasing.
    FRAMEWORK_VERSION: '1.1.0'
    CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
    CPU_PY_VERSION: '2'
    GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
    GPU_PY_VERSION: '3'
    LOCAL_BASE_REPO: 'pytorch-base'
    ECR_REPO: 'sagemaker-pytorch'
    GITHUB_REPO: 'sagemaker-pytorch-container'
    SETUP_FILE: 'setup_cmds.sh'
    # The \n escapes stay literal here; they are expanded later by `printf "$SETUP_CMDS"`.
    SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]'

phases:
  pre_build:
    commands:
      - start-dockerd
      - ACCOUNT=$(aws sts get-caller-identity --query 'Account' --output text)
      - PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
      # Extract the PR number (first run of digits) from the CodeBuild source version.
      - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
      - echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'

  build:
    commands:
      # prepare the release
      - git-release --prepare --min-version 1.2.0

      # install the package and its test dependencies
      - pip3 install -U -e .
      - pip3 install -U -e .[test]

      # run unit tests
      - pytest test/unit

      # build cpu base images (py2 and py3)
      - base_dir="docker/$FRAMEWORK_VERSION/base"
      - cpu_py2_base_tag="$FRAMEWORK_VERSION-cpu-py2"
      - cpu_py3_base_tag="$FRAMEWORK_VERSION-cpu-py3"
      - cpu_dockerfile="Dockerfile.cpu"
      - cd $base_dir
      - docker build -t $LOCAL_BASE_REPO:$cpu_py2_base_tag -f $cpu_dockerfile --build-arg py_version=2 .
      - docker build -t $LOCAL_BASE_REPO:$cpu_py3_base_tag -f $cpu_dockerfile --build-arg py_version=3 .
      - cd ../../../

      # build gpu base images (py2 and py3)
      - gpu_py2_base_tag="$FRAMEWORK_VERSION-gpu-py2"
      - gpu_py3_base_tag="$FRAMEWORK_VERSION-gpu-py3"
      - gpu_dockerfile="Dockerfile.gpu"
      - cd $base_dir
      - docker build -t $LOCAL_BASE_REPO:$gpu_py2_base_tag -f $gpu_dockerfile --build-arg py_version=2 .
      - docker build -t $LOCAL_BASE_REPO:$gpu_py3_base_tag -f $gpu_dockerfile --build-arg py_version=3 .
      - cd ../../../

      # create wheel
      - python3 setup.py bdist_wheel

      # build cpu final images on top of the base images
      - build_dir="docker/$FRAMEWORK_VERSION/final"
      - CPU_PY2_TAG="$FRAMEWORK_VERSION-cpu-py2"
      - CPU_PY3_TAG="$FRAMEWORK_VERSION-cpu-py3"
      - docker build -f "$build_dir/$cpu_dockerfile" --build-arg py_version=2 -t $PREPROD_IMAGE:$CPU_PY2_TAG .
      - docker build -f "$build_dir/$cpu_dockerfile" --build-arg py_version=3 -t $PREPROD_IMAGE:$CPU_PY3_TAG .

      # build gpu final images
      - GPU_PY2_TAG="$FRAMEWORK_VERSION-gpu-py2"
      - GPU_PY3_TAG="$FRAMEWORK_VERSION-gpu-py3"
      - docker build -f "$build_dir/$gpu_dockerfile" --build-arg py_version=2 -t $PREPROD_IMAGE:$GPU_PY2_TAG .
      - docker build -f "$build_dir/$gpu_dockerfile" --build-arg py_version=3 -t $PREPROD_IMAGE:$GPU_PY3_TAG .

      # push images to ecr
      - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
      - docker push $PREPROD_IMAGE:$CPU_PY2_TAG
      - docker push $PREPROD_IMAGE:$CPU_PY3_TAG
      - docker push $PREPROD_IMAGE:$GPU_PY2_TAG
      - docker push $PREPROD_IMAGE:$GPU_PY3_TAG

      # launch remote gpu instance; strip the 'ml.' prefix to get the EC2 instance type
      - prefix='ml.'
      - instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
      - create-key-pair
      - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu

      # run cpu integration tests
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
          pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY_VERSION --processor cpu
        else
          echo "skipping cpu integration tests"
        fi

      # run gpu integration tests on the remote gpu instance
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
          printf "$SETUP_CMDS" > $SETUP_FILE
          cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY_VERSION --processor gpu"
          remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --setup-file $SETUP_FILE --pr-number "$PR_NUM"
        else
          echo "skipping gpu integration tests"
        fi

      # run cpu sagemaker tests
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE
        else
          echo "skipping cpu sagemaker tests"
        fi

      # run gpu sagemaker tests
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE
        else
          echo "skipping gpu sagemaker tests"
        fi

      # write the deployment manifest. Tags are derived from FRAMEWORK_VERSION
      # (previously hard-coded to 1.1.0, which would silently go stale when the
      # env variable changes); SHORT_VERSION drops the patch component, e.g. 1.1.0 -> 1.1.
      - SHORT_VERSION=${FRAMEWORK_VERSION%.*}
      - |
        echo '[{
          "repository": "'$ECR_REPO'",
          "tags": [{
            "source": "'$FRAMEWORK_VERSION'-cpu-py2",
            "dest": ["'$FRAMEWORK_VERSION'-cpu-py2", "'$SHORT_VERSION'-cpu-py2", "'$FRAMEWORK_VERSION'-cpu-py2-'${CODEBUILD_BUILD_ID#*:}'"]
          },{
            "source": "'$FRAMEWORK_VERSION'-cpu-py3",
            "dest": ["'$FRAMEWORK_VERSION'-cpu-py3", "'$SHORT_VERSION'-cpu-py3", "'$FRAMEWORK_VERSION'-cpu-py3-'${CODEBUILD_BUILD_ID#*:}'"]
          },{
            "source": "'$FRAMEWORK_VERSION'-gpu-py2",
            "dest": ["'$FRAMEWORK_VERSION'-gpu-py2", "'$SHORT_VERSION'-gpu-py2", "'$FRAMEWORK_VERSION'-gpu-py2-'${CODEBUILD_BUILD_ID#*:}'"]
          },{
            "source": "'$FRAMEWORK_VERSION'-gpu-py3",
            "dest": ["'$FRAMEWORK_VERSION'-gpu-py3", "'$SHORT_VERSION'-gpu-py3", "'$FRAMEWORK_VERSION'-gpu-py3-'${CODEBUILD_BUILD_ID#*:}'"]
          }],
          "test": [
            "IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --py-version 3 --processor cpu --region {region} --docker-base-name '$ECR_REPO' --aws-id 520713654638 --framework-version '$FRAMEWORK_VERSION' --instance-type '$CPU_INSTANCE_TYPE'",
            "IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --py-version 2 --processor cpu --region {region} --docker-base-name '$ECR_REPO' --aws-id 520713654638 --framework-version '$FRAMEWORK_VERSION' --instance-type '$CPU_INSTANCE_TYPE'",
            "IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --py-version 3 --processor gpu --region {region} --docker-base-name '$ECR_REPO' --aws-id 520713654638 --framework-version '$FRAMEWORK_VERSION' --instance-type '$GPU_INSTANCE_TYPE'",
            "IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --py-version 2 --processor gpu --region {region} --docker-base-name '$ECR_REPO' --aws-id 520713654638 --framework-version '$FRAMEWORK_VERSION' --instance-type '$GPU_INSTANCE_TYPE'"
          ]
        }]' > deployments.json

      # publish the release to github
      - git-release --publish

    finally:
      # shut down remote gpu instance and remove its key pair
      - cleanup-gpu-instances
      - cleanup-key-pairs

artifacts:
  files:
    - deployments.json
  name: ARTIFACT_1
47 changes: 29 additions & 18 deletions buildspec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,36 +34,45 @@ phases:

# build cpu base image
- base_dir="docker/$FRAMEWORK_VERSION/base"
- cpu_base_tag="$FRAMEWORK_VERSION-cpu-py$CPU_PY_VERSION"
- cpu_py2_base_tag="$FRAMEWORK_VERSION-cpu-py2"
- cpu_py3_base_tag="$FRAMEWORK_VERSION-cpu-py3"
- cpu_dockerfile="Dockerfile.cpu"
- cd $base_dir
- docker build -t $LOCAL_BASE_REPO:$cpu_base_tag -f $cpu_dockerfile --build-arg py_version=$CPU_PY_VERSION .
- docker build -t $LOCAL_BASE_REPO:$cpu_py2_base_tag -f $cpu_dockerfile --build-arg py_version=2 .
- docker build -t $LOCAL_BASE_REPO:$cpu_py3_base_tag -f $cpu_dockerfile --build-arg py_version=3 .
- cd ../../../

# build gpu base image
- gpu_base_tag="$FRAMEWORK_VERSION-gpu-py$GPU_PY_VERSION"
- gpu_py2_base_tag="$FRAMEWORK_VERSION-gpu-py2"
- gpu_py3_base_tag="$FRAMEWORK_VERSION-gpu-py3"
- gpu_dockerfile="Dockerfile.gpu"
- cd $base_dir
- docker build -t $LOCAL_BASE_REPO:$gpu_base_tag -f $gpu_dockerfile --build-arg py_version=$GPU_PY_VERSION .
- docker build -t $LOCAL_BASE_REPO:$gpu_py2_base_tag -f $gpu_dockerfile --build-arg py_version=2 .
- docker build -t $LOCAL_BASE_REPO:$gpu_py3_base_tag -f $gpu_dockerfile --build-arg py_version=3 .
- cd ../../../

# create wheel
- python3 setup.py bdist_wheel

# build cpu image
- build_dir="docker/$FRAMEWORK_VERSION/final"
- build_id="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
- CPU_TAG="$FRAMEWORK_VERSION-cpu-py$CPU_PY_VERSION-$build_id"
- docker build -f "$build_dir/$cpu_dockerfile" --build-arg py_version=$CPU_PY_VERSION -t $PREPROD_IMAGE:$CPU_TAG .
- CPU_PY2_TAG="$FRAMEWORK_VERSION-cpu-py2"
- CPU_PY3_TAG="$FRAMEWORK_VERSION-cpu-py3"
- docker build -f "$build_dir/$cpu_dockerfile" --build-arg py_version=2 -t $PREPROD_IMAGE:$CPU_PY2_TAG .
- docker build -f "$build_dir/$cpu_dockerfile" --build-arg py_version=3 -t $PREPROD_IMAGE:$CPU_PY3_TAG .

# build gpu image
- GPU_TAG="$FRAMEWORK_VERSION-gpu-py$GPU_PY_VERSION-$build_id"
- docker build -f "$build_dir/$gpu_dockerfile" --build-arg py_version=$GPU_PY_VERSION -t $PREPROD_IMAGE:$GPU_TAG .
- GPU_PY2_TAG="$FRAMEWORK_VERSION-gpu-py2"
- GPU_PY3_TAG="$FRAMEWORK_VERSION-gpu-py3"
- docker build -f "$build_dir/$gpu_dockerfile" --build-arg py_version=2 -t $PREPROD_IMAGE:$GPU_PY2_TAG .
- docker build -f "$build_dir/$gpu_dockerfile" --build-arg py_version=3 -t $PREPROD_IMAGE:$GPU_PY3_TAG .

# push images to ecr
- $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
- docker push $PREPROD_IMAGE:$CPU_TAG
- docker push $PREPROD_IMAGE:$GPU_TAG
- docker push $PREPROD_IMAGE:$CPU_PY2_TAG
- docker push $PREPROD_IMAGE:$CPU_PY3_TAG
- docker push $PREPROD_IMAGE:$GPU_PY2_TAG
- docker push $PREPROD_IMAGE:$GPU_PY3_TAG

# launch remote gpu instance
- prefix='ml.'
Expand All @@ -74,7 +83,7 @@ phases:
# run cpu integration tests
- |
if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY_VERSION --processor cpu
pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY_VERSION --processor cpu
else
echo "skipping cpu integration tests"
fi
Expand All @@ -83,24 +92,24 @@ phases:
- |
if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
printf "$SETUP_CMDS" > $SETUP_FILE
cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY_VERSION --processor gpu"
remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --setup-file $SETUP_FILE --pr-number "$PR_NUM"
py3_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY_VERSION --processor gpu"
remote-test --github-repo $GITHUB_REPO --test-cmd "$py3_cmd" --setup-file $SETUP_FILE --pr-number "$PR_NUM"
else
echo "skipping gpu integration tests"
fi
# run cpu sagemaker tests
- |
if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $CPU_TAG --instance-type $CPU_INSTANCE_TYPE
pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE
else
echo "skipping cpu sagemaker tests"
fi
# run gpu sagemaker tests
- |
if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $GPU_TAG --instance-type $GPU_INSTANCE_TYPE
pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE
else
echo "skipping gpu sagemaker tests"
fi
Expand All @@ -111,5 +120,7 @@ phases:
- cleanup-key-pairs

# remove ecr image
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_PY2_TAG
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_PY3_TAG
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_PY2_TAG
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_PY3_TAG
2 changes: 1 addition & 1 deletion docker/1.1.0/base/Dockerfile.gpu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04
FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu16.04

ARG py_version

Expand Down
2 changes: 1 addition & 1 deletion test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,4 +188,4 @@ def skip_by_device_type(request, use_gpu, instance_type):
@pytest.fixture(autouse=True)
def skip_by_py_version(request, py_version):
if request.node.get_closest_marker('skip_py2') and py_version != 'py3':
pytest.skip('Skipping the mnist distributed training test if py_version is py2 for now')
pytest.skip('Skipping the test because Python 2 is not supported.')
2 changes: 1 addition & 1 deletion test/integration/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,5 @@
call_model_fn_once_script = os.path.join(resources_path, 'call_model_fn_once.py')

ROLE = 'dummy/unused-role'
DEFAULT_TIMEOUT = 10
DEFAULT_TIMEOUT = 20
PYTHON3 = 'py3'
9 changes: 3 additions & 6 deletions test/integration/sagemaker/test_distributed_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from six.moves.urllib.parse import urlparse

from test.integration import (data_dir, dist_operations_path, fastai_path, mnist_script,
DEFAULT_TIMEOUT, PYTHON3)
DEFAULT_TIMEOUT)
from test.integration.sagemaker.timeout import timeout

MULTI_GPU_INSTANCE = 'ml.p3.8xlarge'
Expand All @@ -44,11 +44,8 @@ def test_dist_operations_multi_gpu(sagemaker_session, ecr_image, dist_gpu_backen


@pytest.mark.skip_cpu
def test_dist_operations_fastai_gpu(sagemaker_session, ecr_image, py_version):
if py_version != PYTHON3:
print('Skipping the test because fastai supports >= Python 3.6.')
return

@pytest.mark.skip_py2
def test_dist_operations_fastai_gpu(sagemaker_session, ecr_image):
with timeout(minutes=DEFAULT_TIMEOUT):
pytorch = PyTorch(entry_point='train_cifar.py',
source_dir=os.path.join(fastai_path, 'cifar'),
Expand Down
2 changes: 1 addition & 1 deletion test/integration/sagemaker/test_mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import pytest
from sagemaker.pytorch import PyTorch

from test.integration import training_dir, mnist_script, DEFAULT_TIMEOUT, PYTHON3
from test.integration import training_dir, mnist_script, DEFAULT_TIMEOUT
from test.integration.sagemaker.timeout import timeout, timeout_and_delete_endpoint


Expand Down
5 changes: 3 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ exclude =
.git
__pycache__
.tox
tests/resources/
test/resources/
lib/
max-complexity = 10
ignore =
Expand Down Expand Up @@ -47,11 +47,12 @@ deps =
pytest-cov
pytest-xdist
mock
requests == 2.18.4
requests == 2.20.0
urllib3 < 1.23, >= 1.21
sagemaker
sagemaker-containers
torch
torchvision
retrying
six

Expand Down

0 comments on commit cb9f417

Please sign in to comment.