Skip to content

Commit

Permalink
Ignore zombie processes when detecting TorchServe status (#166)
Browse files Browse the repository at this point in the history
* Ignore processes that are not running when detecting TorchServe status

* Detect zombie processes and skip them before calling psutil's `cmdline()` API

* Update PT DLC framework version to 2.1.0 and 2.2.0

* Update instance type to ensure newer CUDA driver version

* Upgrade DLAMI version for tests

* Revert "Update instance type to ensure newer CUDA driver version"

This reverts commit bba00bd.
  • Loading branch information
namannandan committed May 31, 2024
1 parent 36a842e commit 9a24052
Show file tree
Hide file tree
Showing 7 changed files with 11 additions and 8 deletions.
6 changes: 3 additions & 3 deletions buildspec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: 0.2

env:
variables:
FRAMEWORK_VERSIONS: '2.0.0 2.0.1'
FRAMEWORK_VERSIONS: '2.1.0 2.2.0'
CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
GPU_INSTANCE_TYPE: 'ml.g4dn.12xlarge'
ECR_REPO: 'sagemaker-test'
Expand Down Expand Up @@ -49,12 +49,12 @@ phases:
- $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
- create-key-pair

# launch remote GPU instance with Deep Learning AMI GPU PyTorch 1.9 (Ubuntu 20.04)
# launch remote GPU instance with Deep Learning AMI GPU PyTorch 2.2 (Ubuntu 20.04)
# build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
- |
for FRAMEWORK_VERSION in $FRAMEWORK_VERSIONS;
do
launch-ec2-instance --instance-type $instance_type --ami-name ami-03e3ef8c92fdb39ad;
launch-ec2-instance --instance-type $instance_type --ami-name ami-081c4092fbff425f0;
DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID";
build_dir="test/container/$FRAMEWORK_VERSION";
docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .;
Expand Down
3 changes: 3 additions & 0 deletions src/sagemaker_pytorch_serving_container/torchserve.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,9 @@ def _retrieve_ts_server_process():
ts_server_processes = list()

for process in psutil.process_iter():
if process.status() == psutil.STATUS_ZOMBIE:
continue

if TS_NAMESPACE in process.cmdline():
ts_server_processes.append(process)

Expand Down
2 changes: 1 addition & 1 deletion test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def pytest_addoption(parser):
parser.addoption('--instance-type')
parser.addoption('--docker-base-name', default='sagemaker-pytorch-inference')
parser.addoption('--region', default='us-west-2')
parser.addoption('--framework-version', default="2.0.0")
parser.addoption('--framework-version', default="2.1.0")
parser.addoption('--py-version', choices=['2', '3'], default='3')
parser.addoption('--processor', choices=['gpu', 'cpu'], default='cpu')
# If not specified, will default to {framework-version}-{processor}-py{py-version}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:2.0.0-cpu-py310-ubuntu20.04-sagemaker
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:2.1.0-cpu-py310-ubuntu20.04-sagemaker

COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:2.0.0-gpu-py310-cu118-ubuntu20.04-sagemaker
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:2.1.0-gpu-py310-cu118-ubuntu20.04-sagemaker

COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:2.0.1-cpu-py310-ubuntu20.04-sagemaker
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:2.2.0-cpu-py310-ubuntu20.04-sagemaker

COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:2.0.1-gpu-py310-cu118-ubuntu20.04-sagemaker
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:2.2.0-gpu-py310-cu118-ubuntu20.04-sagemaker

COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz

Expand Down

0 comments on commit 9a24052

Please sign in to comment.