From e78bf86e431c877aa6b4ed496d17ce6ee53106b8 Mon Sep 17 00:00:00 2001
From: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com>
Date: Thu, 20 Oct 2022 15:10:20 -0700
Subject: [PATCH] Add neuron enhancements (#2355)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [test] Add efa test as placeholder (#185) * [pytorch][sagemaker] PT 1.8.0 cu110 EFA support (#171) * PT 1.7.1 cu110 EFA support * rebase PT 1.7.1 dockerfile and add EFA to PT 1.8.0 dockerfile * Install hwloc, dependency of smdataparallel * Disabled smdataparallel integration test temporarily since current smdataparallel wheel is incompatible with EFA * Updated EFA version to 1.11.2 which comes with MPI v4.1.0 * fix nccl version and add test * update mpi * fix style * Fixed NCCL branch name and moved the Horovod installation before SM Distributed * Disable the framework build and test which is not applicable to this PR * fix failing test * Add MPI flags for EFA * Fixed pytorch nccl version test * Fixed pytorch nccl version python test and disable fresh builds * Disable new builds and enabled smdataparallel test * Re-trigger CI * Revert build config changes Co-authored-by: Lai Wei Co-authored-by: Akhil Mehra * [TensorFlow][Sagemaker] TF 2.4 cu110 EFA support (#172) * TF 2.4 cu110 EFA support * Added -g option for EFA installer * Update NCCL installation * Fixed NCCL installation * Add constant at top * Install hwloc, dependency of smdataparallel * Disabled smdataparallel integration test temporarily since current smdataparallel wheel is incompatible with EFA * Updated EFA version to 1.11.2 which comes with MPI v4.1.0 * update OPEN_MPI * Install NCCL from source and updated the openMPI path * Re-trigger CI * Disable the framework build and test which is not applicable to this PR and added EFA related flag * Fix mpi flag failure * Add correct runtime MPI flags * Add correct MPI flags, modify build config * Disable new builds and Fixed SM Horovod test * Enabled smdataparallel test * Removed building NCCL with specific arch. 
Use default config which builds for all arch * Revert build config changes Co-authored-by: yselivonchyk Co-authored-by: Akhil Mehra * Run PT to test EFA (#191) add sanity efa test * [pytorch] | [test] | [sagemaker] SMModel Parallel pytorch EFA tests on p3dn (#187) SMModel Parallel pytorch EFA tests Co-authored-by: Jeetendra Patil Co-authored-by: Karan Jariwala Co-authored-by: Lai Wei Co-authored-by: yselivonchyk * [tensorflow] | [test] | [sagemaker] (#188) add efa test for tf2 Co-authored-by: Jeetendra Patil Co-authored-by: Karan Jariwala Co-authored-by: Lai Wei Co-authored-by: yselivonchyk * Run PT Rubik EFA test (#194) * run pt efa rubik * skip inference * revert * Run rubik efa tests on tf2 (#195) * run rubik efa tests on tf2 * [test][sagemaker] Add reupload_image_to_test_ecr to SM tests conftest (#193) * [PyTorch][test][sagemaker] EFA test for smdataparallel (#189) EFA test for smdataparallel * [habana] Placeholder for Build and Test Functionality for Habana (#197) * [habana] build functionality * modify habana dedicated flag * enable habana build * build config changes * add pytorch and modify test configuration * move build artifact * test support for habana * nit changes * build changes * nit change * support for SM and benchmark * address comments * build eia and neuron * enable new builds * nit * revert temp configs * remove dead code from eks test * [Habana] Add changeset logic (#198) * changeset logic for habana * enable habana mode * test buildspec * change dockerfiles * disable habana mode and revert changes * remove unwanted code * [test] Run test using existing EC2 instance locally (#201) * Run test using existing EC2 instance * rename pytest fixture * Removing any SM related installs from Dockerfile (#200) * Removing any SM related installs * Cleaned Dockerfile.Added 2.5 folder Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * [pytorch/tensorflow] Habana DLC python 3.7, OMPI in base installer and pytorch DLC fixes (#202) * Habana Pytorch DLC and OMPI Install In Habana Bases * Fix docker path * Rebased and added TF2.5 * Update pytorch to 0.15.0 synapse * Updated Pytorch docker file (#204) * Updated Pytorch docker file. Also updated buildspec to pull whl from s3 bucket * Removed SM packages. Added few more pythom packages. Renamed folder to 0.15 * Minor fix in buildspec * build habana images * correct build config * disable build config Co-authored-by: tejaschumbalkar * Update buildspec.yml (#206) Updated pytorch wheel. Added HPUBase for test cases. 
* SynapseAI 0.15.0 Release DLC Changes (#205) * SynapseAI 0.15.0 Release * Add example branch parse and Habana PR build * Fix extra slash * Revert ENABLE_HABANA_MODE * [Habana][Build] Fix torchvision python version py37 (#207) * Fix torchvision python version py37 * Updated h5py version to 3.1.0 * enable habana mode and disable test * Using pypi package for torchvision * add docker build artifacts * add build artifacts references to buildspec * revert config Co-authored-by: tejaschumbalkar * SynapseAI v0.15.1 release updates (#208) * SynapseAI v0.15.1 release updates * build habana switch on * fix pt parse * ENABLE_HABANA_MODE=False * Updating TF binaries with callback fixes (#210) * Updating TF binaries with callback fixes * Enabling Habana build * Resetting ENABLE_HABANA_MODE=False * SynapseAI v0.15.2 release updates (#209) * SynapseAI v0.15.2 release updates * SynapseAI v0.15.2 release updates * Fix folder naming * Re-Disable ENABLE_HABANA_MODE in build_config.py * SynapseAI v0.15.2 release updates * SynapseAI v0.15.2 release updates * Fix folder naming * Re-Disable ENABLE_HABANA_MODE in build_config.py * Updating Torchvision binary (#211) * Updating Torchvision binary as we need to build with same setup as pytorch for compatibilty * Enabling Habana mode * Reset ENABLE_HABANA_MODE= False * SynapseAI v0.15.3 release updates (#213) * SynapseAI v0.15.3 release updates * SynapseAI v0.15.3 release updates * Enable Habana Mode * Disable Habana Mode * address rebase modifications * [DO NOT MERGE] [autogluon][build, test] Initial PR for training containers (#214) * [autogluon][build, test] fixing instance types (#218) * format ecr repo from image uri (#217) * format ecr repo from image uri * pytest markers for hpu test * more markers * nit habana changes * [habana][build] fix docker entrypoint (#219) * fix docker entrypoint * revert habana mode * Fixed version in autogluon buildspec (#215) * Fixed version in autogluon buildspec * Enabling sagemaker tests * Enable building a new container * Added MAJOR_VERSION into docker files, added autogluon_training fixture * [autogluon][test] SageMaker remote mode tests * [autogluon][test] removed datasets requirement Co-authored-by: Sergey Togulev Co-authored-by: Alexander Shirkov * [autogluon][test] tests fixes (#220) * [autogluon][test] tests fixes * [autogluon][test] tests fixes * [autogluon][test] removed jupyter dependencies leftovers * [autogluon][test] removed jupyter dependencies leftovers * [autogluon][test] version checks fixes * [autogluon][test] pip check fixes * [autogluon][test] pip check fixes * [autogluon][test] sm_local tests fixes * [autogluon][test] sm_local tests fixes * [autogluon][test] applied pillow security fixes to autogluon * [autogluon][test] removed jupyter dependencies leftovers * [build][test]Rolling back default parameters changes (#224) * Rolling back default parameters changes * [autogluon][test] test fixes Co-authored-by: Sergey Togulev Co-authored-by: Alexander Shirkov * [autogluon][release]Releasing Autogluon 0.2.1 (#227) Co-authored-by: Sergey Togulev * [autogluon][test]Fixes for AG sanity tests (#226) Co-authored-by: Sergey Togulev * [release] Fixed release notes logic (#228) Co-authored-by: Sergey Togulev * [release] Fix for AG release notes (#229) * [release] Fixed release notes logic * [release] Fixed release notes logic Co-authored-by: Sergey Togulev * [autogluon][release] Release AG container (#230) * [release] Fixed release notes logic * [release] Fixed release notes logic * [release] Fixed release notes logic 
Co-authored-by: Sergey Togulev * [release] Fix for imp_pip_packages (#231) * [release] Fixed release notes logic * [release] Fixed release notes logic * [release] Fixed release notes logic * [release] Fixed release notes logic * [release] Fixed release notes logic Co-authored-by: Sergey Togulev * Ag release (#232) * [release] Fixed release notes logic * [release] Fixed release notes logic * [release] Fixed release notes logic * [release] Fixed release notes logic * [release] Fixed release notes logic * [autogluon][build] Build AG 0.3.0 Co-authored-by: Sergey Togulev * [habana] fix pip check requirements (#225) * habana sanity test * reinstall boto3 * upgrade boto3 * remove comments * revert temp configs * [test] Merger testrunner from public (#234) Co-authored-by: Sergey Togulev * SynapseAI v0.15.4 release updates (#233) * SynapseAI v0.15.4 release updates * SynapseAI v0.15.4 release updates * Enable Habana Mode * Revert "Enable Habana Mode" This reverts commit 9ed1a8f58d2d5c71977ff0cc660e3228c3dd8874. * [test] Building AG 0.2.1 (#236) Co-authored-by: Sergey Togulev * Remove hb-torch & install into --user for python packages (#237) * Remove hb-torch before installing AWS torch * python packages to user space install * add -y to uninstall * enable habana mode * disable habana mode Co-authored-by: tejaschumbalkar * [build] habana build modifications (#238) * habana build modifications * run test safety * make sanity test compatible with hpu processor * fix sanity test * sync up utility test changes from public repo * address comments * revert temp config * release habana dlc to gamma stage (#243) * [release] fix numbering on release_images.yml (#244) * fix_numbering * move syai inside of job_type * remove PT1.7 and TF2.5 from release_images.yml (#245) * Remove keras package before installing tensorflow (#247) * Remove keras package before installing tensorflow * Enable habana_mode * run test safety * disable habana mode * revert safety test changes Co-authored-by: tejaschumbalkar * Bump tensorflow in /test/sagemaker_tests/huggingface_tensorflow/training (#242) Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.5.0 to 2.5.1. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.5.0...v2.5.1) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [hopper][build] Add hopper build code (#246) * Merge master into private-master (#248) * [test] Add hopper_mode to quick checks tests (#251) * followup sync changes * [hopper][build] sync hopper dockerfiles with huggingface dockerfiles (#254) * [hopper][build] sync hopper dockerfiles with huggingface dockerfiles * Enable hopper mode * Fix bug with CI for Hopper * Use py38 wheel and disable debug env vars * Update xla wheel and set buildspec correctly for hopper * Fix framework path and artifact name * Fix framework version path * Disable hopper mode Co-authored-by: Sai Parthasarathy Miduthuri * [hopper][build] Add more wheels for hopper (#258) * buildspec and status modifications (#261) * [hopper][pytorch][test] Fix horovod tests (#266) * Reinstall horovod for hopper * Enable hopper mode * Remove hopper dedicated * Revert hopper dedicated * Update dlc_developer_config.toml * [hopper][test] Fix getting framework for hopper (#265) * [hopper][test] Fix getting framework for hopper * Add dummy change to trigger build * Add dummy change in buildspec to trigger build * Add dummy change in dockerfile * Remove hopper dedicated * Update main.py * Update main.py * Update main.py * Remove dummy changes * Update dlc_developer_config.toml * [hopper][pytorch][build] Update transformers wheel (#267) * [hopper][pytorch][build] Update transformers wheel to the latest (#269) * [hopper][pytorch][build] Update hopper wheels (#270) * [habana] fix pip check and unpin werkzeug package (#271) * unpin werkzeug package * install latest version * fix rebase changes * fix pip check * revert temp config * install typing * build habana dlc * revert temp changes * release PT1.9 diy/sm (#272) * [release] adjust customer_type for diy/sm (#273) * adjust customer_type * adjust customer_type * nit change * remove neuron (#274) * add habana packages to release page (#241) * [hopper][build][pytorch] Update hopper pytorch wheels (#275) * Update hopper pytorch wheels * [hopper][build][pytorch] Update transformers wheel (#276) * [hopper][build][pytorch] Update transformers wheel * [hopper][build][pytorch] Update transformers wheel (#278) * [hopper][build][pytorch] Update transformers wheel * Disable hopper mode * Synch HF images from public (#281) Co-authored-by: Sergey Togulev * [hopper][build][pytorch] Upgrade transformers to 11.0 (#282) * Upgrade transformers to 11.0 * Update transformers version * Disable hopper mode * trigger builds * retrigger builds Co-authored-by: tejaschumbalkar * 
[hopper][huggingface_tensorflow][huggingface_pytorch][build][test] Build and test Hopper images with sm pysdk (#280) * Added the changes to build hopper images with sm pysdk * Added the tests to run using sm pysdk * Added debug lines * Run SM local tests and address comments * Deactivated ecs and eks tests. * Reverting the dev config changes * [test][sagemaker] Make PySDK binary selection logic generic for the SM tests and SM local tests (#283) * Make PySDK binary selection logic generic for the SM and SM local tests * Make hopper mode true * Revert the changes * [hopper][build][pytorch][tensorflow] Update fw wheels with init changes (#284) * [hopper][build] Update fw wheels with init changes * Enable test flags * Fix typo * Disable test flags * [hopper][build][pytorch] Fix Hopper DT NaN issue (#288) * Fix Hopper DT NaN issue * Update dlc_developer_config.toml Co-authored-by: pinaraws <47152339+pinaraws@users.noreply.github.com> * [hopper][build][pytorch][tensorflow] Fix licence files (#289) * [hopper] [build] [pytorch] Updating SM trcomp PT wheels for DT support (#293) * Updating SM trcomp PT wheels for DT support * Update dlc_developer_config.toml Co-authored-by: pinaraws <47152339+pinaraws@users.noreply.github.com> * [hopper][build][pytorch] Include examples dir in transformers wheel (#291) * Include examples dir in transformers wheel * Update transformers wheel * Update dlc_developer_config.toml Co-authored-by: pinaraws <47152339+pinaraws@users.noreply.github.com> * [hopper] [test] [sagemaker] Adding tests targeting the SM Training Compiler integrated containers Private master (#286) * Fix bugs in framework init functions. +new Fx Wheels for HF-trcomp Create remote and local test for HF-PT-trcomp Create remote tests for HF-TF-trcomp Make tests shorter * Added handlers for non implemented tests * Updating HF-trcomp tests to look for log messages indicating trcomp has been ingaged in the training logs * Fix for smdebug EC2 test. * Adding HF-PT-trcomp tests to test different trcomp configs. Porting testing to work with HF-TF-trcomp. * Finalizing HF-trcomp tests Fixed HF-TF-trcomp build recipe. Add redundancy to all trcomp build recipes Fixing test dependencies * Increasing retries for HF trcomp tests * Skipping HF-PT-trcomp local test since it hangs. 
Will fix later * Reverting test mode Co-authored-by: Sergey Togulev * [test] Fix smart retry benchmark tests (#1452) (#296) * Fix for multithreading error in SM local tests * Rollback dlc_developer_config changes * Fix for SM local tests * Rolled back dev_config changes * Fix for multithreading error in SM local tests * Rollback dlc_developer_config changes * Fix for SM local tests * Rolled back dev_config changes * Fix for smart retry benchmark tests Co-authored-by: Sergey Togulev (cherry picked from commit df440538a7c5f580301c5f3a1c56c14beab48821) Fix smart retry (#1451) * Fix for multithreading error in SM local tests * Rollback dlc_developer_config changes * Fix for SM local tests * Rolled back dev_config changes Co-authored-by: Sergey Togulev (cherry picked from commit 97fb152a7022f252d4349742cbc7d7c3bc0af9a6) [test] Smart retry functionality (#1414) * check pytest cache * enable builds * enable builds * enable builds * enable builds * disable builds * disable builds * enable builds * Added -p to mkdir * Using dinamic obj name * Added try-catches * Moved everything to separate functions * Fixed a small bug * Removed separate functions * Removed separate functions * Fixed bugs * Fixed bugs * Fixed bugs * Added tests for sagemaker * Typo fix * Added last-failed for sagemaker * Fixing sm-local tests * Removed json * updated ec2 commands * using string in threads pool instead of dict * moved to p.map again * moved to p.map again * Rolled back dev_config changes * Fixed sm-local tests * Fixed sm-local tests * Fixed sm-local tests * refactored pytest_cache.py * fixed a bug * removed code for sagemaker remote tests * rolled back dev config * A few changes after the review * A few changes after the review * Fixed a typo * Added account number parameter * Refactored utils instantiating * A few NITs Co-authored-by: Sergey Togulev (cherry picked from commit 5938a87927cbd7c4500a04a98c2d58dea82d3dad) Co-authored-by: Sergey Togulev * Fix for smart retry (#300) Co-authored-by: Sergey Togulev * [trcomp] [build] Fixing debug artifact path for trcomp (#299) * [trcomp] [build] Fixing debug artifact path for trcomp * fix: Adding additional checks to trcomp HF-PT debug tests to ensure debug artifacts are uploaded. 
* Reverting PR test config * [hopper][build][pytorch] Fix transformers gradient clipping issue (#304) * Fix transformers gradient clipping issue * Trigger build * Use pipeline-built transformers wheel * Update dlc_developer_config.toml Co-authored-by: pinaraws <47152339+pinaraws@users.noreply.github.com> * release_images.yml with hopper images (#306) Co-authored-by: Sergey Togulev * [release] Release trcomp (#307) * release_images.yml with hopper images * Added trcomp Co-authored-by: Sergey Togulev * [hopper][build][pytorch] Add distributed training entry point (#308) * [hopper][build][pytorch] Add distributed training entry point * Disable tests * Skipping benchmark tests for trcomp containers (#309) Co-authored-by: Sergey Togulev * [tensorflow][build][test] Tensorflow2.6 with SM PySDK keynote3 (#287) * Tensorflow2.6 with SM PySDK keynote3 * Adding leftover changes * Increase image size * Use partially complete keynote3 PySDK * Added changes to pass pr quick checks * Minor fix for sanity and quick checks * Fixing the download path * Log absolute path * Fixing the path for pr checks * Reformatted using black -l 120 * Addressed comments * Increased image size * After the latest wheel release * [config] Fix `do_build` config option (#1494) * Set do_build as false * Sync the cpu dockerfile with public master * Added the keras version pinning * Minor fix * Pinned tensorflow io * Make gpu dockerfile same as public with pinned tfio * Install new sm binaries * Added the increased sizes * Added changes for tf2.6.2 * Make image baseline 8000 * Changed the tf2.6.2 binaries to many_linux latest * Revert dlc developer config Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * Skipping sm debugger tests for trcomp containers (#310) Co-authored-by: Sergey Togulev * add graviton support (#313) * revert graviton release specs (#314) * [trcomp][build][pytorch] Fix distributed training entry point (#315) * [trcomp][build][pytorch] Fix distributed training entry point * Skipping sm debugger tests for trcomp containers Co-authored-by: Sergey Togulev Co-authored-by: Sergey Togulev <34056697+SergTogul@users.noreply.github.com> * [build]|[test]|[tensorflow] Made changes to build TF2.6.2 with SmPySDK and Boto (#316) * Made changes to build TF2.6.2 with SmPySDK and Boto * Revert temp chagnes * Added sanity check tests * release graviton for gamma testing (#317) * [huggingface-neuron] Update release_images.yml (#318) * Update release_images.yml (#319) * Update release_images.yml For hf neuron for the time being have disable_sm_tag to True * Update release_images.yml Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [trcomp] [pytorch] [build] Defaulting GPU_NUM_DEVICES to 1 (#321) * [trcomp] [pytorch] [build] Defaulting GPU_NUM_DEVICES to 1 * [trcomp] [pytorch] [test] Testing default value of GPU_NUM_DEVICES * Reverting PR config * Upgrade pillow in TF hopper container (#322) Co-authored-by: Sergey Togulev * Pillow fix (#323) * Upgrade pillow in TF hopper container * fixed a typo in a dockerfile Co-authored-by: Sergey Togulev * [trcomp] [pytorch] [build] Fixing CVEs (#324) * [trcomp] [pytorch] [build] Fixing CVEs * Skipping not needed frameworks * Removing hf-pt to trigger hopper tests * Trying to execute hopper tests * Skipping not needed frameworks * Fixed dependency check issues self-discovery * Addded print for debugging * [trcomp] [pytorch] [build] Fixing CVE in bokeh * Moved bokeh installation into a different block * Removed temp logging * [trcomp] [pytorch] [build] 
Fixing CVE in numpy and ipython * Rollback temp changes Co-authored-by: Sergey Togulev <34056697+SergTogul@users.noreply.github.com> Co-authored-by: Sergey Togulev * Bump tensorflow in /test/sagemaker_tests/huggingface_tensorflow/training (#295) Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.5.1 to 2.5.2. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.5.1...v2.5.2) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * [trcomp] [pytorch] [build] Fixing perf issues in g4dn instances (#325) * [trcomp] [pytorch] [build] Fixing perf issues in g4dn instances * Revert PR check config Co-authored-by: Sergey Togulev <34056697+SergTogul@users.noreply.github.com> * [test][sanity] Removed temp changes from test runner (#327) * [trcomp] [pytorch] [build] Fixing CVEs * Skipping not needed frameworks * Removing hf-pt to trigger hopper tests * Trying to execute hopper tests * Skipping not needed frameworks * Fixed dependency check issues self-discovery * Addded print for debugging * [trcomp] [pytorch] [build] Fixing CVE in bokeh * Moved bokeh installation into a different block * Removed temp logging * Rollback temp changes * Rollback temp changes Co-authored-by: Loki Co-authored-by: Sergey Togulev * Using pypi sagemaker (#332) Co-authored-by: Sergey Togulev * Merging from PUBLIC (#333) * Merging from PUBLIC * Fixed docker login * Fixed parameter passing * Fixed import * Fixed sm_helper import * Rollback config changes Co-authored-by: Sergey Togulev * [Trcomp][CI] logic change copied from PR331 (#337) * [Trcomp][CI] logic change copied from PR331 * comment out failed dockerfile commands * revert dev config * update dev config * address comments * set dev config * fix typo * update * remove sagemaker test skip * sync with PUBLIC * remove unwanted habana test * revert dev config * remove sagemaker test skip for pytorch trcomp Co-authored-by: tejaschumbalkar * [trcomp] [pytorch] [build] Adding support for PyTorch 1.10 (#329) * [trcomp] [pytorch] [build] Adding support for PyTorch 1.10 * Setting developer config for PR validation tests * [trcomp] [pytorch] [build] Release PyTorch 1.10.0 * [trcomp] [pytorch] [build] Adding common training dependencies * [trcomp] [pytorch] [test] Changing tests to reflect changes to HF logging in 4.16.2 * [trcomp] [pytorch] [build] Adding common training dependencies * [trcomp] [pytorch] [build] Upgrading PT from 1.10.0 to 1.10.2 * [trcomp] [pytorch] [build] Adding torchaudio binaries * [trcomp] [pytorch] [build] Updating NCCL version in binaries * [trcomp] [pytorch] [test] Adding back skip markers after bad merge * [trcomp] [pytorch] [build] Updating torch version to reflect X.Y.Z+cuABC * [trcomp] [pytorch] [build] Fixing numpy version to fix dependency for package numba * fiix sanity failures * rename dockerfile * remove duplicate test skip logic * update e3 test skip logic * fix sagemaker test directory * fix sanity test * enable ec2 test run and fix smdebug test * nit change * fix framework name * fix variable name * [trcomp] [test] Removing/Replacing internal code names * [trcomp] [pytorch] [build] Fixing GPU_NUM_DEVICES issue with Distributed Training 
* [trcomp] [pytorch] [build] Adding support for G5 instances with A10 GPUs * Reverting developer config Co-authored-by: tejaschumbalkar Co-authored-by: Qingzi-Lan * [trcomp][build] fix the base image version for TF 2.6.3 (#331) * fix the base image version * update dev config * upgrade numpy & openssl * downgrade numpy to 1.21 * fix sanity tests * enable ec2 test * update ec2 test skip logic * update dockerfile name logic * update * update * update * fix typo * update * update * update * fix typo * skip horovod test * update * update dev config * fix sagemaker test path * update sagemaker test skip fixture * update * update dev config * revert dev config Co-authored-by: Qingzi-Lan Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> * [release] release HF Trcomp TF2.6.3 & PT 1.10.2 (#338) * release HF Trcomp TF2.6.3 & PT 1.10.2 * backup previous release_images.yml * Sync eks infrastructure changes (#340) * Graviton eks infrastructure (#1579) * initial commit * add pre-deploy * add nodegroup support * modify eks buildspec * build a cluster * add kubeconfig * nit change * revert temp changes * explictly set managed node * remove managed option * add option to upgrade nodegroup * nit change * template update Co-authored-by: Ubuntu Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> Co-authored-by: Qingzi-Lan * [eks] Upgrade EKS nodegroups and enable eks test for graviton (#1821) * ung * enable eks test for graviton * build image * disable config * deploy graviton nodegroups Co-authored-by: Ubuntu Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> Co-authored-by: Qingzi-Lan * upgrade nodegroup (#341) * Merge from PUBLIC repo @ef69cf4 (#339) * test merge from PUBLIC * trigger test * update dev config * revert dockerfile change * change dockerfile * update utils * debug modified dockerfile regexp * debug github handler file changed * revert debug info, and force to_build to true * enable habana build * fix merge error * restore files from PUBLIC * revert dev config and "changeset limited to 20files" work around * [build] Find buildspecs using configured env vars (#366) * [pytorch][build] Remove patch version from buildspec file name (#376) * Sync from public repo (#387) * release pt-1.10.0 (#1616) * release pt-1.10.0 Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [huggingface_pytorch][NEURON][build] Huggingface Neuron inference DLC (#1578) Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: Venky Natham * [build][graviton][mxnet][pytorch] fix graviton image build (#1618) * fix graviton image build * revert dev config * Run dependency check on HF neuron images (#1622) * [tensorflow][test][benchmark] Makeshift fix for flaky benchmark tests (#1575) * Makeshift fix for flaky benchmark tests * Shifted the if condition * Reverting change * Removing unnecessary import * reverting temp changes * Add support for multistage dockerfiles for e3/sagemaker (#1532) * Exclude dependency check library from tool (#1611) * [MXNet][build][test] Release MX 1.9.0 inference & training binaries (#1217) Co-authored-by: Sai Parthasarathy Miduthuri Co-authored-by: Wei Chu Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * Update release images for MX1.9 (#1639) * Run MX sagemaker benchmarks on SM images (#1640) * [test][sagemaker]Sm remote smart retry (#1573) * Refactored mxnet sm multi-region tests * Rollback devconfig changes * Update SM smart retry * converting 
custom_cache_directory to string * converting custom_cache_directory to string * converting custom_cache_directory to string * upload cache to s3 * upload cache to s3 * upload cache to s3 * upload cache to s3 * upload cache to s3 * added broken test * added broken and working tests * added broken and working tests * added broken and working tests * Fixed bug * Fixed bug * Revert temp changes * Fixed bug * Rolled back temp changes * Added a few comments * A few edits after review * Rolled-back temp changes Co-authored-by: Sergey Togulev * [doc] Added NVIDIA Triton inference containers to available images (#1591) * [NEURON][TEST] - Update the manifest for 1.17.0 release (#1632) * [neuron][huggingface] Update MMS version in HF Neuron DLCs (#1644) * support py38 in MX sagemaker tests (#1652) * Update MX 1.9 example images (#1654) * Update numpy version in MX images (#1656) * Pin numpy to <1.20 in MX 1.9 images (#1657) * Pin numpy to <1.20 in MX 1.9 images * update buildspec * Habana Synapseai v1.2.0 dockerfiles (#1627) * Habana 1.1.1 release update * Update docker image path to 1.1.1 release docker * Added 1.9.1 pytorch * Added 2.7.0 tensorflow * Turn on habana_mode=true * update framework binaries * update dockerfile to py38+ul20 * Fix Pytorch docker container path * update license files * Update 1.2.0 links * update binaries for PT1.10 * update pt binaries * remove pytorch_binary from buildspec * Remove dataclass/typing workaround from previous releases * fix few build failures * Unpin Pillow package and fix dataclass/typing on 2.7 instead of 2.5 * unpin request * allow openssl cve * update tf wheel with tensorflow-cpu * fix security issue * nit change * revert developer config Co-authored-by: Wei Chu Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * [NEURON][BUILD][MX] - update to sdk1.17.0 (#1636) * [NEURON][BUILD][TF2.5] - update to use sdk1.17.0 and also tf2.5.2 (#1635) * Release MX inference images for MXNet 1.9 (#1662) * update availabel_images.md for MX1.9 (#1655) * [NEURON][BUILD][PT] - move to sdk1.17.0 and also use pytorch 1.10.1 (#1634) * [NEURON][RELEASE] - Update yml file to add PT1.10.1 and TF2.5.2 (#1668) * Relase Neuron Images for sdk1.16.0 Release PT1.9.1, TF1.15.5, Tf2.5.1, MX:1.8.0 Signed-off-by: Venky Natham * don't look for sm tag Signed-off-by: Venky Natham * Add neuron release 1.16.1 version Signed-off-by: Venky Natham * add neuron release 1.16.1 Signed-off-by: Venky Natham * update available images for neuron Signed-off-by: Venky Natham * fix md file to have py37 for pt Signed-off-by: Venky Natham * add old neuron versions Signed-off-by: Venky Natham * Release PT1.10.1 and TF2.5.2 Neuron DLC Signed-off-by: Venky Natham * add to release_images.yml Signed-off-by: Venky Natham * add mxnet Signed-off-by: Venky Natham * Update release_images.yml * Update .release_images_template.yml Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [NEURON][BUILD][TF] - Upgrade tf1.15.5 to use the neuron sdk 1.17.0 (#1642) * Release neuron sdk1.17.0 version of tf1.15.5 dlc (#1673) * Relase Neuron Images for sdk1.16.0 Release PT1.9.1, TF1.15.5, Tf2.5.1, MX:1.8.0 Signed-off-by: Venky Natham * don't look for sm tag Signed-off-by: Venky Natham * Add neuron release 1.16.1 version Signed-off-by: Venky Natham * add neuron release 1.16.1 
Signed-off-by: Venky Natham * update available images for neuron Signed-off-by: Venky Natham * fix md file to have py37 for pt Signed-off-by: Venky Natham * add old neuron versions Signed-off-by: Venky Natham * Release PT1.10.1 and TF2.5.2 Neuron DLC Signed-off-by: Venky Natham * add to release_images.yml Signed-off-by: Venky Natham * add mxnet Signed-off-by: Venky Natham * Update release_images.yml * Update .release_images_template.yml * release neuron sdk 1.17.0 version of tf1.15.5 Signed-off-by: Venky Natham Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * tensorflow_serving 2.8 e3 inference container (#1671) * add wip dockerfiles * add tensorflow_model_server * update ci instructions * update tensorrt * change pyversion in buidlpsec;rm files in /tmp for stray_file_test * update cve allow list * revert dev config * udpate tmp file delete * revert dev config, add tf27 buildspec Co-authored-by: Sergey Togulev <34056697+SergTogul@users.noreply.github.com> Co-authored-by: Qingzi-Lan Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> * Add e3 dockerfiles for tensorflow 2.8 (#1647) * add tf28 e3 container dockerfiles * update buildspec * use numpy tensorflow dependency * update buildspec to reflect change of python version * Update buildspec.yml * install cudnn-dev * update cudnn * fix typo * enable safety scan * update horovod installation * A few security upgrades * upgrade pillow to 9.0.1 * urllib3 to the latest * ignore numpy false positive vulnerability * Fixed urllib constrain * Skipped a couple of safety tests * Turn off safety scan * update wheel * remove tempory pem file * revert dev config * set dev config with safety check * revert dev config Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: Sergey Togulev <34056697+SergTogul@users.noreply.github.com> Co-authored-by: Sergey Togulev Co-authored-by: Qingzi-Lan Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> * Bump tensorflow in /test/sagemaker_tests/huggingface_tensorflow/training (#1677) Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.5.2 to 2.5.3. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.5.2...v2.5.3) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * use js_import instead of js_include for TF serving nginx configuration (#1666) * tf2.7 inf build * fix buildspec * nginx configuration * revert config * remove - * rename tfs file * nginx configuration * remove js_content * manage export statement * fix nginx errors * Enabling safety test * revert temp changes * address comments * nit change * change file name * adjust file name * nit change * enable inference build * revert buildspecfile changes Co-authored-by: Sergey Togulev <34056697+SergTogul@users.noreply.github.com> * [Habana]|[Builld]|[Test] Enable Safety Scan Ignore list for Habana numpy issues (#1678) * Enable Safety Scan Ignore list for Habana numpy issues * Changed the ignore messages * Reverted developer config changes Co-authored-by: Shantanu Tripathi * add TF2.8 in release images (#1676) * Release neuron sdk 1.17.0 tf1.15.5 (#1681) * Relase Neuron Images for sdk1.16.0 Release PT1.9.1, TF1.15.5, Tf2.5.1, MX:1.8.0 Signed-off-by: Venky Natham * don't look for sm tag Signed-off-by: Venky Natham * Add neuron release 1.16.1 version Signed-off-by: Venky Natham * add neuron release 1.16.1 Signed-off-by: Venky Natham * update available images for neuron Signed-off-by: Venky Natham * fix md file to have py37 for pt Signed-off-by: Venky Natham * add old neuron versions Signed-off-by: Venky Natham * Release PT1.10.1 and TF2.5.2 Neuron DLC Signed-off-by: Venky Natham * add to release_images.yml Signed-off-by: Venky Natham * add mxnet Signed-off-by: Venky Natham * Update release_images.yml * Update .release_images_template.yml * release neuron sdk 1.17.0 version of tf1.15.5 Signed-off-by: Venky Natham * release neuron sdk 1.17.0 based tf1.15.5 Signed-off-by: Venky Natham Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * Add support for sagemaker-like E3 tag (#1672) * [canary] Update python versions for TF canaries (#1682) * fix TF tests issues (#1684) * Habana DLC Perf/TestSuite TF/PT tests -- gaudi test suite (#1567) * Habana DLC Perf/TestSuite TF/PT tests * Add Habana DLAMI Tensorflow Performance Benchmarks * Add Habana DLAMI PyTorch Performance Benchmarks * Add Habana DLAMI Tensorflow Test Suite * Add Habana DLAMI PyTorch Test Suite * Apply gaudi-test-suite to test bert, rn50, maskrcnn, framework, etc. 
* Test cleanup and exit code fix * Fix gaudi-test-suite branch name * To extract the Throughput correctly * Update scripts for 1.2.0 release * Add tf requirement installation * Remove comments * fix test scripts * enable habana mode * configure git creds * build habana images * adjust test dir * run benchmark tests * fix docker command * update pt binary * build new image * use dedicated github granch * nit change * pin pt setuptools * pin setuptools * fix log file * fix benchmark test * awscli support * fix dep check * nit changes * run benchmark test * adjust pytest timeout for habana * turn off benchmark mode * add habana fixture * run benchmark test * increase timeout * revert temp config * increase timeout to 5hr * build image * run benchmark test * revert temp config Co-authored-by: Sergey Togulev <34056697+SergTogul@users.noreply.github.com> Co-authored-by: Buke Ao Co-authored-by: Anny Chung Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> Co-authored-by: Wei Chu * change cudnn version for tf2.8 for compatibility with p2 instances (#1688) * update cudnn version * update buildspec * test on p2 instance * revert dev config Co-authored-by: Qingzi-Lan * Habana release v1.2 images for TF and PT (#1687) * release v1.2 * nit * habana release v1.2 (#1691) * Bump numpy in /test/sagemaker_tests/pytorch/inference (#1679) Bumps [numpy](https://github.com/numpy/numpy) from 1.16.4 to 1.21.0. - [Release notes](https://github.com/numpy/numpy/releases) - [Changelog](https://github.com/numpy/numpy/blob/main/doc/HOWTO_RELEASE.rst.txt) - [Commits](https://github.com/numpy/numpy/compare/v1.16.4...v1.21.0) --- updated-dependencies: - dependency-name: numpy dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [NEURON][BUILD][HF] - move hf neuron dlc to use latest sdk (#1669) * [NEURON][BUILD][HF] - use ubuntu18 (#1700) * use ubuntu18 Signed-off-by: Venky Natham * enable test Signed-off-by: Venky Natham * remove libtinfo6 install as that is specific to u20 Signed-off-by: Venky Natham * Update dlc_developer_config.toml Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [NEURON][BUILD][TF] - Move tf2.5.2 neuron to sdk 1.17.1 (#1696) * [NEURON][BUILD][MX] - Move to neuron sdk1.17.1 (#1698) * [NEURON][BUILD][PT] - Move pt1.10 to neuron sdk1.17.1 (#1699) * [NEURON][BUILD][TF] - Move tf1.15.5 to use neuron sdk 1.17.1 (#1697) * Release neuron sdk 1.17.1 version (#1702) Signed-off-by: Venky Natham * [doc] Update available images for neuron sdk release 1.17.1 (#1703) * Add release images definition for HF PyTorch Neuron (#1694) * [PyTorch E3] PT 1.10.2 DLC release (#1683) * pt1.10.2 * add dgl * update vision binaries * update numpy and pillow versions * fix numpy 1.22.0 installation * update versions for cpu * pin ipython version * fix ipython installation * update dgl pt container tests * config for e3 only * pin numpy version * skip CVE 44463 * fix format * update dev config * update dev config * disable dgl * disable dgl cpu test for eks * revert graviton changes * revert sagemaker wheel * remove pt1.10.0 buildspec * revert dev config * Update dlc_developer_config.toml Co-authored-by: Wei Chu Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * Add tf2.7 training 
sagemaker dockerfiles (#1628) * add tf2.7 sagemaker dockerfiles * update buidlspec * remove non-compatible python packages * add dependencies for kebros * use manylinux wheels; add sagemaker dockerfiles * update horovod installation env vars * update horovod installation script * use numpy as tensorflow dep * Update buildspec.yml * install boost * increase image size limit * update pillow and add docker lables * use wheels from smdebuggers pipelines * fix sanity test * add labels for tf 2.7 sm cpu * rerun * build+rerun * reinstall horovod cpu * install smdebug directly from tag * fix typo; * Revert "fix typo;" This reverts commit c5bd300d2141a91ac4f3f1d6d13711aa975370cb. * Revert "install smdebug directly from tag" This reverts commit c51ef6b95b20de6f65397f34a29806ab77c03461. * Executing safety check in PR * install smdebug directly from the branch * bump up tensorflow to 2.7.1 * install higher version of tensorflow-io to avoid overriding tensorflow * Ignoring a false positive vulnerability * install tfds * change pytest comands * do not install dependencies as they have been installed in the dockerfiles * add SAGEMAKER_TRAINING_MODULE environment variable * remove pem file in tmp folder * update sagemaker-tensorflow * add smdataparallel * revert rm /tmp * remove /tmp/git-secrets * experiment with an smdebug fix * Revert "experiment with an smdebug fix" This reverts commit b19ee8347ed6208ff9c2ac81d489dba785632199. * skip test_keras_mirrored.py * fix error in buidlspec * Revert "fix error in buidlspec" This reverts commit b973fa415e324a6f63d1fb816b22848e35600934. * revert developer_config * fix buildspec * fix buildspec * fix py version * revert buildspec to mainline Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: tejaschumbalkar Co-authored-by: Sergey Togulev <34056697+SergTogul@users.noreply.github.com> Co-authored-by: Sergey Togulev Co-authored-by: Qingzi-Lan Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> * pt1.10.2 release images (#1706) * pt1.10.2 release images * add example * TF2.8: Clean up dockerfiles, update HVD test (#1693) * update pt1.10.2 release images (#1707) * update pt1.10.2 release images * Update release_images.yml Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [Build][tensorflow] fix TF27 GPU CVE-2022-24407 (#1710) * test * update * udpate * update * should fail * test cpu and gpu * update gpu sasl package * udpate libsasl manually * update * add TF27 release images (#1714) * [Tensorflow] add comment on py39 installation on TF 2.8 dockerfiles (#1715) * document TF28 dockerfile * update * Release TF2.8 e3 images (#1716) Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> * [Tensorflow][Test][ec2] Fix Habana Tensorflow EC2 tests (#1704) * Changed dev config to build images * Added safety check test true * Changed the build to true in buildspec * Add logic to upload and read from s3 with a break statement * Remove break and fix tail bug * Change loop time and last line of script * Added modularity * Removing unwanted logs * Modifying the while loop to check if the test can end early * Reformatting the code * Fixing bugs and refactoring * Minor fix * refactored code and added buckets for each account * Refactored to include the ValueError within execute_async method * Implemented bucket logic * Reverting temp changes Co-authored-by: Shantanu Tripathi * bug fix (#1717) * re-releas TF27 sagemaker cpu training (#1720) * [build][pytorch] pt1.10 add openssh support (#1619) 
* [tensorflow] Bug fixes to TF2.8 E3 images (#1723) * [tensorflow] Bug fixes to TF2.8 E3 images * add sasl install * upgrade sasl instead of reinstalling * Revert "upgrade sasl instead of reinstalling" This reverts commit 51eb07408a404edde16e5bb2ddb3aa3b782a37a7. * [Habana] [test] [ec2, sagemaker] Fix to skip SM tests for Habana and modify async testing API (#1724) * Fix to skip SM tests for Habana and modify async testing API * Added the hang detection window variable * Revert developer config Co-authored-by: Shantanu Tripathi * Move sasl to upgrade instead of install (#1726) * Add dependabot config file to scan Dockerfiles (#1727) * Add dependabot config file to scan Dockerfiles * Update dependabot.yml * [PyTorch] PyTorch 1.10.2 SageMaker DLC (#1709) * pt1.10.2 sm dlc * merge from upstream master * refactor smdebug installation * set enable_test_promotion:false for e3 Co-authored-by: Wei Chu * Configured release_images.yml for TF2.8e3 re-release and PT1.10.2 SM release (#1737) * Configured release_images.yml for TF2.8e3 re-release * Update release_images.yml * Add Pytorch release changes to the yml Co-authored-by: Shantanu Tripathi Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [build][pytorch] pytorch 1.9 add openssh support (#1621) * add openssh support * build training image only * revert dev config * update * update package version * udpate * revert dev config * [tensorflow] Add dockerfiles for TF2.8 (#1685) * add sagemaker dockerfiles * update developer config * update buildspec * fix typos * fix typo for python version * add smdebug * add sagemaker-tensorflow * add smdataparallel * remove tmp files * update test config * remove wrong ldlib path * update tensorflow-io version` * remove sagemaker-tensorflow til py39 pkg become available * remove sagemaker-tensorflow * add sagemaker-tensorflow * install sagemaker-tensorflow from source * install tfds * do not install tesnorflow-dataset in the tests as it was installed in the image * set datetime_tag to false * correct python version * update buildspec * pass arguments related to python to e3 and sagemaker stages as env vars * install smdebug from the tag * minor update for sagemaker-tensorflow installation * bug fix * Changes to config file * Make fix for cyrus CVE * Change configs file to disable safety_check_test * bump up requests * run benchmark without rebuild * run sagemaker rc tests * run efa tests * unistall tfds as it is installed in the image already * run rc tests * remove unused env vars * fix license * update buildspec to build sagemaker images only * Revert "update buildspec to build sagemaker images only" This reverts commit 908c89dcec178fe964346516cf12f52b6448868d. 
* skip safty checks * remove license from sagemaker stage * revert dlc_developer_config.toml * remove unused comments * skip test_keras_mirrored for TF2.7 * fix styling issues * add env var for TF version * comment out e3 and example images build Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: Shantanu Tripathi Co-authored-by: Shantanu Tripathi * Update available images for TF2.7 SM and TF2.8 E3 (#1741) * [TensorFlow] bump up tensorflow to 2.6.3 (#1721) * [TensorFlow] add sagmaker dockerfiles for tensorflow_serving (#1689) * add sagmaker dockerfiles * build sagemaker image * pass build args as env variables to sagemaker stages, remove unused dockerfle Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: Sai Parthasarathy Miduthuri * [tensorflow] [build] [test] TF2.8 SM image fix (#1748) * TF2.8 SM image fix * Rebuild images with new SMDebug tag released * Changed the smdebug versioning format * Removed additional code for skipping tests * Change buildspec and revert temp changes * Added newline at ends of buildspecs * [tensorflow][build][sagemaker] enhance gunicorn logging (#1750) * [autogluon][build] AutoGluon 0.3.1 container patching (#1734) Co-authored-by: Sergey Togulev <34056697+SergTogul@users.noreply.github.com> * [autogluon][build] AutoGluon 0.3.2 container (AG 0.3.1 with patched images) (#1752) * [release] Add TF 2.8 SM DLCs to release images (#1755) * [doc] Update available images (#1754) * [release] Add AG 0.3.2 images to release (#1757) * [Tensorflow][Test][benchmark][ec2] Invoke all the Habana benchmark tests using async execution (#1711) * Basic config for building images and running the tests * Added timeout for benchmark runs * Invoking async execution for the benchmark test * Added uuid to logs and increased loop time * Added background process into ec2 for uploading logs * Making the function background connection.run async * Reverted to connection run logic * Shift to background process execution without breaking in case of no progress * Changes with different filenames * Minor fix * Run all habana benchmark tests asynchronously * Run all tf tests * Removed PT from being skipped * Made minor changes * Removed comments * Reverting changes in execute_asynchronus_testing_using_s3_bucket * Revert testrunner * Added minor docstring * Revert buildpsecs * Reverted all temp changes Co-authored-by: Shantanu Tripathi * [tensorflow][build][test] Fix test failures for TF 2.6 dockerfiles (#1756) * [huggingface_pytorch] upgrade Pytorch to 1.10 (#1630) Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: Venky Natham Co-authored-by: Sai Parthasarathy Miduthuri Co-authored-by: Rahul Huilgol * [release] Remove customer_type from AG spec (#1758) * [autogluon][test][sagemaker] Add support for py38 version in tests (#1760) * [pytorch] [build] [E3] PyTorch 1.11.0 (#1719) * [Habana]|[Build]|[Test] Fix Habana PT benchmark issues due to framework version regex match (#1742) * Fix Habana PT benchmark issues due to framework version regex match * Revert temp changes * [pytorch] [build] [E3] Scope multistage ARGs globally in PyTorch 1.11.0 (#1762) * [release] Add TF 2.6.3 and HF PT 1.10.2 (#1766) * [release] Re-release TF2.7 inference images as TF2.7.1 (#1767) * Habana DLC 1.3.0 Release (derived from PR #1692) (#1722) * first version of 1.3.0 * fix mistakes * habana mode enabled * fix PT BERT requirements with new pip * update framework binaries * keep torchvision * Empty Commit * Running safety test 
* remove some tools for TF
* Make changes to async test execution
* Unpin absl-py
* Run benchmark tests
* Empty commit to rerun tests
* Add fix for PT tests
* Enable datetime tag to ensure new image is made
* Changed PT dockerfile to use the dlc.py from a specific commit
* Add COPY command to copy the dlc_container file
* Increased timeout limit
* Changed hang detection window to 30
* Change Habana DLAMI for tests
* Allow building images
* Rerun tests with reduced window
* Check ec2 tests
* Rerun the sanity tests
* Reformatting to add enable_habana_async_execution
* Making exception for PT1.10.1
* Rebuilding image to see if the openssl installed is 1.1.1f-1ubuntu2.12
* Added pytorch lightning additional vulnerability to ignore list
* TF2.8 hpu images skip openssl issue
* Undo PT fix changes
* Revert temp changes
* Responded to nit comments
Co-authored-by: Buke Ao
Co-authored-by: Omri Almog
Co-authored-by: Wei Chu
Co-authored-by: omrialmog
Co-authored-by: Shantanu Tripathi
* [huggingface_tensorflow] upgrade Tensorflow to 2.6 (#1629)
Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
Co-authored-by: Qingzi-Lan
* [autogluon][build] AutoGluon 0.4.0 container (#1759)
* [ec2][test] Fix PT 1.11 test issues (#1772)
* [huggingface_tensorflow][build] Fix TF 2.6.3 binary source in dockerfile (#1771)
* [huggingface_tensorflow] Fix TF 2.6.3 binary source in dockerfile
* Run local tests
* update ignore safety ids
* set to RC test
* fix typo
* revert dev config
Co-authored-by: Qingzi-Lan
Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com>
* [Release][Tensorflow] release HF TF 2.6.3 (#1775)
* [release] Add PT 1.11.0 to release_images (#1776)
* [Habana] Get commit using env variables (#1778)
* Get commit using env variables
* Revert temp changes and respond to nit
* [doc] Add AutoGluon 0.3.2 to available images (#1761)
* [Habana] Fixing license file for Tensorflow (#1781)
* Habana: Fixing license file for Tensorflow
* Disabling reruns for Habana tests
* Revert the temp changes
* Habana PT1.10.1 and TF2.8.0 release (#1784)
* Habana PT1.10.1 and TF2.8.0 release
* Removed older releases
* [pytorch][build] upgrade ts to torchserve-nightly==2022.3.23.post in PT1.10 (#1779)
Co-authored-by: Sai Parthasarathy Miduthuri
* [mxnet][build] Update mx1.9.0 cpu binaries (#1773)
Co-authored-by: Wei Chu
Co-authored-by: kevinyang8 <40340762+kevinyang8@users.noreply.github.com>
* [release] Add AutoGluon 0.4 DLCs to release images (#1786)
* [Autogluon][doc] AutoGluon 0.4.0 available images update (#1789)
* Add MXNet 1.9 CPU to release images (#1792)
* Add MXNet 1.9 CPU to release images
* add sagemaker for mxnet release images
* disable force release
Co-authored-by: Kevin Yang
* [eks][test] EKS cleanup workflow (#1788)
* add cron files
* adjust function caller
* reorganize files
* add loggers
* nit change
* reformat
* update dockerfile
* remove unwanted comments
* spell check
* fix import (#1793)
* [pytorch][1.10][training] fix typo in mpirun path (#1774)
* disable part test (#1798)
* [PyTorch][1.10] Fix SM Local tests (#1797)
Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com>
* Reduced the default TRAINING_TEST for benchmark tests. (#1785)
* Reduced the default TRAINING_TEST for benchmark tests.
* Adding config file
* Running ec2 tests
* Revert temp changes
Co-authored-by: Shantanu Tripathi
* [pytorch] [build] PT1.11.0 - reduce PyTorch GPU Train image size (#1783)
* move torchnet to torch package installations
* collapse torch package installation commands
* change buildspec to build train E3 GPU only
* update torch packages installation commands in all dockerfiles
* update buildspec to build all E3 images
* add torchnet installation
* [pytorch][build][E3] Update PyTorch 1.11.0 cuda 11.3 Dockerfiles (#1795)
* change GPU train cuda 115 to 113
* update buildspec to build e3 gpu train only
* edit dockerfile path to cu113
* update inference GPU from cu115 to cu113
* update buildspec to build all e3 images with cu11.3
* save cu115 for reference purpose
* update PyTorch installation lines to reduce image size
* update package versions on GPU images
* build E3GPU image only
* update OFI version from 1.1.3-aws to 1.2.0-aws
* update buildspec to build all e3 images
* update EFA to 1.15.1 to upgrade libfabric
* update EFA installation strategy for dependencies
* fix openmpi path as #1787
* fix sagemaker section buildspec and gpu train
* un-pin libcudnn8 as it's backward compatible
* fix mpi command in sagemaker image
* add apt-get clean and rm apt/lists/* for all apt-get commands (see the sketch below)
* save buildspec for pt1.11.0-cu115
* update image baseline sizes in buildspec
Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com>
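
For reference, the "add apt-get clean and rm apt/lists/*" item above refers to the usual Docker layer-size hygiene pattern; a minimal sketch of what such a step generally looks like (the package name here is a placeholder, not taken from these Dockerfiles):

    # Install packages and clean the apt cache in the same RUN instruction,
    # so the cache and index lists never persist into the image layer.
    RUN apt-get update \
     && apt-get install -y --no-install-recommends build-essential \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*

Keeping the cleanup in the same RUN instruction matters because each instruction produces its own layer; removing the lists in a later step would not shrink the image.
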
* re-release PT1.10.2 (#1807)
* enable overwritable MMS parameter (#1806)
Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
* [test] Add tests for sagemaker environment variables (#1810)
* Add SM env variable tests
* update dev toml
* Update dlc_developer_config.toml
* [Pytorch][build] update OFI and EFA version (#1804)
* update OFI and EFA version
* install packages before clean apt cache
* update efa installation
* address comments
* fix typo
* revert dev config
* [Doc] Update HF Trcomp image uri (#1814)
* Update HF Trcomp image URI
* separate current and previous images
* [tensorflow][inference] Add test to validate TF serving versioning (#1809)
* update PT1.11 image uri in available_images (#1817)
* Release sagemaker model parallel binary for HF DLC (#1800)
* update smp binary
* enable tests
* revert testing changes
* update smmpmp binary
* update smmp binary
Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com>
Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com>
Co-authored-by: haohanchen-yagao
Co-authored-by: Junpu Fan
* [sanity] Update sanity tests to use separate prefixes (#1819)
* Update sanity tests to use separate prefixes
* Update __init__.py
* Graviton eks infrastructure (#1579)
* initial commit
* add pre-deploy
* add nodegroup support
* modify eks buildspec
* build a cluster
* add kubeconfig
* nit change
* revert temp changes
* explicitly set managed node
* remove managed option
* add option to upgrade nodegroup
* nit change
* template update
Co-authored-by: Ubuntu
Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com>
Co-authored-by: Qingzi-Lan
* [build]|[test] Adds Web scraper Module and handles automations related to ECR scan. (#1241)
* Adds rudimentary crawler over cve urls
* Processes rows and takes care of packages
* Added comments and changed the string processing logic
* Added yield to store the data in 1 json by directly using the scrapy command
* Added extra info for patch key and included comments
* Fixes the issue of having extra commas
* Preserve the order
* Adds README
* Adds more information to README
* Added Environment Variable and note table processing
* Changed Readme
* Minor README changes
* Minor README changes
* Added settings to store json beautifully
* Added test script to call the crawler from script
* Removes the dependency on Environment Variable, Uses Script to call Spider
* Added code to store data in user specified file
* Rudimentary commit to integrate web scraper with Ecr test workflow
* Added scrapy and crochet in requirements.txt
* Fixes
* Added apt upgrade working
* Added the functionality for conducting union.
* Added the file saving mechanism
* Adding dummy allowlist
* Changes to allowlist
* Added s3 functionality
* Made the changes required to send data to ECR-AS lambda
* Added the functionality to invoke the lambda
* Changed key names
* Added medium threshold setter
* Added apt upgrade lists
* Logic for changing apt-upgrade-list.txt
* Fix multiple commits by custom sorting
* Adding modifications for inference files
* Change buildspec file
* Changes location of upgrade list
* Just build cpu
* Added context for apt-upgrade-list to buildspec
* Retrigger builds
* Refactoring and adding code for allowlist update
* Logged vulnerability lists in s3
* Added wait time on scraper
* Rerun the tests
* Adding more details in case of apt update failure
* Descriptive update and upgrade failures
* Added hide and warn as true in update/upgrade
* Added multiple retries for apt upgrade
* Added Logging in security check
* Added extra info during logs and renamed scraper runner
* Ran black on files
* Refactored test_security_check to save the ecr_image_vulnerability_list as well
* Removed scraped_data.json and prevented example images from allowlist test
* Added allowlist for Training Dockerfile cpu
* Add allowlists for train and inf mxnet 1.8 images
* Revert neuron changes
* Fix the scraper to fetch the packages
* Partially responded comments
* Refactored get_new_image_uri method
* Refactored s3 bucket and set apt_ran_successfully to false
* Removing MINIMUM_SEV_THRESHOLD and running formatter
* Refactored the test_security_check to move code to security.py
* Added apt-upgrade-list functionality to scale to all the processor types
* Added comments and rectified indentation
* Rectified dictionary operations
* Extend Neuron Coverage
* Changed developer config
* Partial refactoring on security.py
* Refactoring get_sorted_vulnerability_list
* Refactored security.py
* Removed fetch_other_vulnerability_lists from test_security_check.py
* Changed trshanta-bucket to ecr-scan-helper
* Deployed new lambda and ecr repo
* Running after giving access to lambda
* Addressed feedback related to repo and bucket names
* Moved files to neuron sdk1.17.1
* Replacing original buildspec with buildspec 1.8 yml
* Build neuron changes after merge
* Added package mapping
* Change mxnet neuron allowlist
* Non neuron mode
* dev config just run the tests
* Allow single lambda invocation per day
* Revert temp changes and buildspec changes
* Fix quick pr issues
* Fix quick pr issues crochet
* Shift web scraper imports to required functions
* [eks] Upgrade EKS nodegroups and enable eks test for graviton (#1821)
* ung
* enable eks test for graviton
* build image
* disable config
* [test] Add a common file to ignore vulnerabilities affecting all containers (#1822)
* Add a common allowlist file to ignore high level vulnerabilities that affect all DLCs
* Reverting temp changes
* [huggingface_pytorch, huggingface_tensorflow] Add support for speech & vision (#1777)
* add ffmpeg
* add tests and audio file
* add ignore again for model files
* fixed image path
* fix install
* fix serializer import
* fixed audio naming
* added inference toolkit v2
* removed remote package
* changed instance type to ml.p3.2xlarge
* bumped numpy for cve
* Update Dockerfile.cpu
* bump numpy to numpy>=1.22.3
* revert numpy version bump
* update cve whitelist
* update cve whitelist
* update package checking
* revert image.py
* build inf image only
* use HuggingFaceModel class
* update image size
* update CVE whitelist
* update CVE whitelist
* update CVE whitelist
* added exception for TF and automatic-speech-recognition pipeline
* revert dev config
Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
Co-authored-by: Qingzi-Lan
Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com>
* [Triton][docs] Added NVIDIA Triton CPU image (#1803)
* [test] Remove redundant scans on the canary (#1829)
* Remove redundant scans on the canary
* Minor fixes
* Revert temp changes
* Release sagemaker model parallel binary for DLC (#1801)
* update smp binary
* enable test; change build spec
* update the HF version
* change for efa test
* update smmpmp binary
* update smmp binary
* fix repeated test name
* Revert "fix repeated test name"
This reverts commit 6a46408c823f273275342bb8f1413fb7a34d8215.
* revert dlc_developer_config.toml and buildspec.yml
Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com>
Co-authored-by: haohanchen-yagao
Co-authored-by: Junpu Fan
Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
* Update available images markdown file (#1832)
* [test] Refactor test_security_check (#1830)
* [test] Remove install from repo.anaconda.com for PT 1.11 (E3) images (#1812)
* remove install from repo.anaconda.com from PT 1.11 images
* update training images and developer config file
* add sanity test to check repo.anaconda.com present in image
* fix env variable
* remove CONDA_PREFIX environment variable due to problems with mamba installation
* Update dlc_developer_config.toml
* Update test_anaconda.py
* update anaconda sanity test
* update anaconda test:
* disable build in toml
* address comments for test and update black version
* add mamba version arg
* fix env var
* fix mamba arg
* remove whitespace and make mamba version arg consistent
* add to runtime error message
* revert developer config toml
* Update test/dlc_tests/sanity/test_anaconda.py
Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com>
* set datetime to true
Co-authored-by: Kevin Yang
Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com>
* [bug] Fix bug in canary pull logic (#1827)
* [bug] Fix bug in canary pull logic
* Update __init__.py
* Update __init__.py
* Update __init__.py
* Fix e3 image adding logic (#1834)
* [test] Restrict image coverage for allowlist feature (#1833)
* Restrict image coverage for allowlist feature
* Use specifierset
* Add libopenmpt to apt_package_name
* Revert temp changes
* refactoring nit
* [canary] Update canary regex (#1835)
* [canary] Update canary regex
* remove temporary canary
* Update dlc_developer_config.toml
* [release] Release HF image updates from #1777 (#1836)
* add hf tf
* add hf pytorch
* Enable MXNet builds and fix syntax warning in test_utils (#1838)
* Fix syntax warning in test_utils
* Enable MXNet builds
* more syntax fix
* Update test_pre_release.py
* [release] Release MXNet 1.9 E3 images (#1842)
* [release] Empty release_images.yml
* release MX 1.9 containers
* Add example images
* [test] Fix canary failures for gpu images (#1837)
* Fix canary failures for gpu images
* Change the logic and run the tests
* Remove image digest
* Using pre-built functions
* Revert temp changes
* [huggingface_pytorch] Update MMS version for PT1.9 to enable logs again (#1823)
* update mms version
* update mms version
* Update dlc_developer_config.toml
* run rc tests
* Update dlc_developer_config.toml
Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
* [test] Fix sorted package order for apt-upgrade-list (#1854)
* Fix package order for apt-list
* Add temp config file
* Revert temp changes
* [build] Rectify apt errors arising due to nvidia key rotation (#1855)
* Rectify apt errors arising due to nvidia key rotation
* getting new key
* Minor fix
* Another fix from the nvidia forums
* Fetch keys (see the sketch below)
* Revert temp changes
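
For context, the key-rotation items above correspond to NVIDIA's April 2022 rotation of its CUDA repository signing keys; the generally documented remedy is to drop the old key and install the new cuda-keyring package before running apt-get update. A minimal sketch, assuming an Ubuntu 20.04 x86_64 base image (the exact repository path and approach used in these Dockerfiles may differ):

    # Remove the expired CUDA repository key and install NVIDIA's new keyring
    # so that apt-get update succeeds again.
    RUN apt-key del 7fa2af80 \
     && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \
     && dpkg -i cuda-keyring_1.0-1_all.deb \
     && rm cuda-keyring_1.0-1_all.deb
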
* [pytorch][build][SM] Update PyTorch 1.11.0 cuda 11.3 Dockerfiles (#1799)
* change GPU train cuda 115 to 113
* update buildspec to build e3 gpu train only
* edit dockerfile path to cu113
* update inference GPU from cu115 to cu113
* update buildspec to build all e3 images with cu11.3
* save cu115 for reference purpose
* update PyTorch installation lines to reduce image size
* update package versions on GPU images
* build E3GPU image only
* update OFI version from 1.1.3-aws to 1.2.0-aws
* update buildspec to build all e3 images
* update EFA to 1.15.1 to upgrade libfabric
* update EFA installation strategy for dependencies
* fix openmpi path as #1787
* fix sagemaker section buildspec and gpu train
* build all images
* un-pin libcudnn8 as it's backward compatible
* fix mpi command in sagemaker image
* add apt-get clean and rm apt/lists/* for all apt-get commands
* update SM gpu binary link
* save buildspec for pt1.11.0-cu115
* install smdebug from source for 1.0.15
* remove wrong pytorch installation command in cu115 sm
* update image baseline sizes in buildspec
* install smdebug 1.0.15 from source in sm cpu image
* disable tests for e3 images
* update sm cpu binary link
* Revision A: sagemaker standard + sagemaker local
* disable datetime_tag
* specify cudnn version and not apt-get upgrade for EFA
* do_build = false
* update DGL sagemaker gcn.py
* skip all frameworks except pytorch
* skip fastai tests for PT1.11
* Revision: sagemaker standard
* Revision: sagemaker rc
* Revision: sagemaker efa
* update smd data parallel binary url
* update SMD binary links format
* update developer config, revision A: sagemaker standard
* revision B: sagemaker rc
* revision C: sagemaker efa
* revision: sagemaker rc + local
* revision: sagemaker rc + local; do_build=false
* revision: sagemaker standard; do_build=false
* revision: sagemaker efa; do_build=false
* do_build inference images
* update torchserve to 0.5.3
* update to install torchserve and torch-model-archiver nightly 2022.3.23
* update python version to 3.8.13, to resolve CVE issue
* build all sagemaker images
* enable datetime_tag; sagemaker standard test
* remove base_image_name for sagemaker images
* rename telemetry env test from training to inference
* do_build=false
* Install smdebug 1.0.16 from source
* install torch-model-archiver stable instead of nightly
* do_build=true and sagemaker rc tests
* do_build = false
* revision: sagemaker standard
* revision: sagemaker rc
* revision: sagemaker efa + local
* datetime_tag=false; do_build=true
* revision: sagemaker standard
* revision: sagemaker rc
* do_build=false
* revision: sagemaker efa + local
* update smdmp binary url
* do_build=true; revision: sagemaker standard
* do_build=false; revision: sagemaker rc + local
* revision: sagemaker efa
* update horovod to version 0.24.3
* do_build=true; revision: sagemaker standard
* do_build=false; revision: sagemaker rc
* revision: sagemaker efa + local
* update PT 1.11 cu115 smd binary arg format
* update PT_E3_INFERENCE_URL to PT_INFERENCE_URL
* sagemaker standard tests
* update sagemaker dgl test
* update sagemaker dgl test
* update sagemaker test_dgl to use gcn.py
* update gpu arg parse for sagemaker dgl gcn test
* update SMDATAPARALLEL_BINARY to SMD_DATA_PARALLEL_URL
* do_build=true; sagemaker standard
* do_build=false; sagemaker rc
* update dlc_major_version to 2
* update comment on smdebug source install
* do_build=true: sagemaker standard
* Enable ec2 tests for SM since cuda version differs for e3 and sm images
* Only cu113 e3 images should have v2
* Change major versions in training dockerfile cu113
* Fix Nvidia key rotation errors
* Fixing for inference images
* Revert temp changes
* Changed keys to Ubuntu20.04
* Implement another NVIDIA key rotation solution
* Revert temp changes
Co-authored-by: Zeeshan Ashraf
Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
Co-authored-by: Shantanu Tripathi
* [test] Fix locally missing images in canary (#1856)
* Fix locally missing images in canary
* Disable image deletion
* Revert temp changes
* Release PT1.11 SM CPU/GPU TRAIN/INF images (#1860)
* Modified available images to add PT1.11 SM images (#1861)
* empty release images (#1862)
* [release] Add TF 2.8 inference DLCs to release (#1864)
* [NEURON][BUILD] - PT1.10, TF2.5 to sdk1.19.0 and mx1.8 to sdk1.18.0 (#1859)
* PT1.10, TF2.5 to sdk1.19.0 and mx1.8 to sdk1.18.0
Signed-off-by: Venky Natham
* enable test
Signed-off-by: Venky Natham
* Add missing files
Signed-off-by: Venky Natham
* Change mxnet wheel
Signed-off-by: Venky Natham
* Use the same version of sagemaker-pytorch-inference
With the sdk1.17.1 dockerfile the version of sagemaker-pytorch-inference is 2.0.8. The latest 2.0.10 for some reason throws the following error: InvalidModelException: Model version is not defined.
Signed-off-by: Venky Natham
* Fix some test
Signed-off-by: Venky Natham
* Bump the dlc major version
Signed-off-by: Venky Natham
* Use mamba instead of mini conda
Signed-off-by: Venky Natham
* add tf 1.15
Signed-off-by: Venky Natham
* add torchserve vuln
Signed-off-by: Venky Natham
* add torchserve vuln to safety check
Signed-off-by: Venky Natham
* get_dockerfile_path_for_image add neuron sdk version
Signed-off-by: Venky Natham
* fix the get_dockerfile_path_for_image
Signed-off-by: Venky Natham
* fix the ecr scan
Signed-off-by: Venky Natham
* fix the scan
Signed-off-by: Venky Natham
* Take care of review comments
Signed-off-by: Venky Natham
* change python version
Signed-off-by: Venky Natham
* move back to python 3.7.10
Signed-off-by: Venky Natham
* revert config changes
Signed-off-by: Venky Natham
* [NEURON][TEST][MXNET] - Update the allowlist for the test to pass (#1871)
* PT1.10, TF2.5 to sdk1.19.0 and mx1.8 to sdk1.18.0
Signed-off-by: Venky Natham
* enable test
Signed-off-by: Venky Natham
* Add missing files
Signed-off-by: Venky Natham
* Change mxnet wheel
Signed-off-by: Venky Natham
* Use the same version of sagemaker-pytorch-inference
With the sdk1.17.1 dockerfile the version of sagemaker-pytorch-inference is 2.0.8. The latest 2.0.10 for some reason throws the following error: InvalidModelException: Model version is not defined.
Signed-off-by: Venky Natham
* Fix some test
Signed-off-by: Venky Natham
* Bump the dlc major version
Signed-off-by: Venky Natham
* Use mamba instead of mini conda
Signed-off-by: Venky Natham
* add tf 1.15
Signed-off-by: Venky Natham
* add torchserve vuln
Signed-off-by: Venky Natham
* add torchserve vuln to safety check
Signed-off-by: Venky Natham
* get_dockerfile_path_for_image add neuron sdk version
Signed-off-by: Venky Natham
* fix the get_dockerfile_path_for_image
Signed-off-by: Venky Natham
* fix the ecr scan
Signed-off-by: Venky Natham
* fix the scan
Signed-off-by: Venky Natham
* Take care of review comments
Signed-off-by: Venky Natham
* change python version
Signed-off-by: Venky Natham
* move back to python 3.7.10
Signed-off-by: Venky Natham
* revert config changes
Signed-off-by: Venky Natham
* Change the allowlist
Signed-off-by: Venky Natham
* Release neuron sdk 1.19.0 (#1868)
Signed-off-by: Venky Natham
* Update autogluon tests to resolve pip check issue (#1872)
* Update autogluon 0.4 dockerfiles to resolve pip check issue
* update dev config
* Update dlc_developer_config.toml
* [pytorch][build] Undo NVIDIA GPG Key Fix (#1869)
* Undo NVIDIA GPG Key Fix
* Revert temp config toml changes
Co-authored-by: Sai Parthasarathy Miduthuri
Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
* [test] Skip pip checks on canary (#1881)
* Update PT 1.10 E3 and SM images to remove repo.anaconda.com (#1874)
Co-authored-by: Kevin Yang
Co-authored-by: Sai Parthasarathy Miduthuri
* [pytorch][build][test] Fix PT1.10.2 mpi test and CW logs (#1867)
Co-authored-by: Sai Parthasarathy Miduthuri
* [test] Make the canary image pull logic scalable (#1878)
* Make the canary image pull logic scalable
* Respond to comments
* Check for all python versions of a released image
* Respond to comments
* Responding to nits
* Sort the parsed canary images to prevent diff test failure (#1884)
* [NEURON][DOC] - Update available images (#1886)
* [test][Canary] Restrict regex based python version extraction for just AutoGluon images (#1885)
* [release] Add PT 1.10 E3 and SM DLCs to release (#1887)
* [release] Fix release images (#1891)
* [pytorch][build][test] update PT 1.11 buildspec to prepare for release of E3 images (#1875)
* update PT 1.11 buildspec to prepare for release of E3 images
* add PT 1.11 to openssl whitelist
* Update test/dlc_tests/sanity/test_pre_release.py
Co-authored-by: Kevin Yang
Co-authored-by: Sai Parthasarathy Miduthuri
* [tensorflow]|[build]|[ec2,ecs,eks] Add E3 dockerfiles for TF2.9 (#1828)
* release(tf): Add E3 dockerfiles for TF2.9
* fix: Drop experimental from tf.keras.mixed_precision APIs in tests
* fix: more fixes for the dropped experimental APIs
* update: Use rc1 binaries
* temp: update skip_frameworks for tests
* update: use rc2 wheels
* temp: try bumping cudnn version
* update: use GA release wheels
* fix: allow TF2.9 cpu/gpu for CVE sanity tests
* fix: allow TF2.9 on CVE-2022-1292
* Revert "temp: update skip_frameworks for tests"
This reverts commit b75769f92e78a40ecbe5b385eeb6169b19e7d9f8.
* update: upgrade external packages
* temp: update skip_frameworks for tests
This reverts commit 9b7b5b4aa38f703462225434be6d36cd5ddc2573.
* Revert "temp: update skip_frameworks for tests"
This reverts commit 7e0db5589a65c2cc8865f148457cfe58a2d2a18e.
Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
* update huggingface PT 1.10 to remove repo.anaconda.com (#1879)
* update huggingface PT 1.10 to remove repo.anaconda.com
* remove ruamel_yaml from packages installed
* Add ARG MAMBA_VERSION in cu113 Docker file
* Revert dlc_developer_config file
Co-authored-by: Kevin Yang
Co-authored-by: RadhikaB-97
Co-authored-by: Radhika Bhat <78102284+RadhikaB-97@users.noreply.github.com>
* [release] Release configuration for TF2.9 E3 Training DLC (#1903)
* tf2.9 e3 training config
* add rest of configs to template
* set disable_sm_tag to false
* Add TF2.9 E3 training images to available_images.md (#1906)
* [tensorflow] [sagemaker] Add nginx timeout (#1893)
Co-authored-by: Sai Parthasarathy Miduthuri
* [pytorch][build] Upgrade EFA to 1.15.2 on PT 1.10 Training DLC (#1908)
* Upgrade TS version to v0.6.0 (#1898)
Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
Co-authored-by: Sai Parthasarathy Miduthuri
* Release configuration for HF PT1.10 (#1909)
* Release config changes HF PT1.10 training (#1910)
* [release] Release PT 1.10.2 DLCs (#1929)
* [pytorch][graviton][canary] Fix graviton canary DLC pull test (#1928)
* Update pytorch-graviton pillow to 9.0.1
* Test Pytorch Graviton Pillow 9.0.1 docker update
* get only graviton images for graviton ARCH
* Typo on 948 __init__.py
* test_pre_release.py
* created better logic for graviton canary images
* another parse_canary_images refactor for graviton
* re-enable src module.
* locking captum to 0.4.1
* reverting version on captum module
* prep for merge
* re-enable tests for PR verification
* removing unused graviton_tag
* put toml back to normal
* [test] Disable dependency checks from canaries (#1873)
* fix post processing for TF benchmark (#1905)
* fix benchmark
* revert dev config
* [release] Add TF 2.8 DLCs to release images (#1935)
* [tensorflow][build][e3] Bump TF version to 2.9.1 (#1930)
* update: bump tf to 2.9.1
* fix: version number in buildspec
Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com>
* [release] TF 2.9.1 release configuration (#1936)
* TF 2.9.1 release configuration
* add line
* [pytorch][graviton][test] remove anaconda and fix TS 0.5.2 test timeout (#1932)
* Release Pytorch-Graviton-E3 Image (#1940)
* Update pytorch-graviton pillow to 9.0.1
* Test Pytorch Graviton Pillow 9.0.1 docker update
* get only graviton images for graviton ARCH
* Typo on 948 __init__.py
* test_pre_release.py
* created better logic for graviton canary images
* another parse_canary_images refactor for graviton
* re-enable src module.
* locking captum to 0.4.1
* reverting version on captum module
* prep for merge
* re-enable tests for PR verification
* removing unused graviton_tag
* put toml back to normal
* use conda-forge instead of anaconda. pillow update
* removing local test paths
* pytorch CPU into allow_openssl_cve_2022_1292
* fixed missing comma in dict
* adding opencv to pip for 0.5.2 torchserve
* prep for merge
* updating pip pin for torchvision
* test torchvision pip update
* update toml for merge
* release pytorch-graviton
* [test] Fix dependency check test for AG 0.3 DLCs (#1945)
* [test] Fix dependency check test for AG 0.3 DLCs
* Correction:
* Revert temp changes
* [autogluon] [test] Fix AG 0.4 dependency check test (#1957)
* [autogluon] Fix protobuf version issue (#1961)
* update conda test to exclude .github directory (#1963)
* [release] Add AG 0.4 to release spec (#1943)
* [mxnet] [build] Get mxnet docker image building. (#1956)
* Fix inference and security test for TF graviton (#1964)
* build graviton tf image
* install protobuf
* adjust condition, run black
* revert dev config
* [release] Add AG 0.3.2 to release images (#1966)
* [test] Disable anaconda test (#1965)
* [mxnet] [build] Fix mxnet gpu inference build (#1967)
* TF2.7 Graviton release (#1969)
* [release] Add MX 1.9 to release images (#1970)
* [test] Fix test failures on Neuron SDK 1.17.1 DLCs (#1959)
Co-authored-by: Venky Natham
* [TF][NEURON][BUILD] - For sdk 1.17.1 release use the right tf 2.5.2 version in tag (#1973)
* Fix sanity test for TF2.7 (#1955)
* Fix sanity test for TF2.7
* fix 2.7.0 inference
* revert change
* install protobuf
* allowlist dep check
* fix nvidia env variable
* build inference image only
* build and test training
* revert temp configs
* [release] Add Neuron SDK 1.17.1 DLCs to release images (#1972)
* [release] Remove released images from release_images (#1974)
* [Pytorch][Build][Test] Fix PT1.9.1 training vuls (#1948)
* Fix PT1.9.1 training vuls
* Make safety fixes
* Add pip numpy version >=1.22.2
* Added the attrs=20.3.0 version required by sagemaker 2.92.1
* Uninstall and install attrs
* Downgrade sm pysdk
* Pin pillow>=9.1.1
* Fix Protobuf
* Revert numpy pinning
* Change Conda numpy versioning
* Numba pin to 0.55.2
* Using numpy==1.19.1
* add multiprocessing start method to spawn
* Using numpy-1.22.3 after test fix
* datetime tag as false
* Added protobuf to prevent sagemaker failures
* Preventing new builds, skipping test_smmodelparallel_mnist_multigpu_singlenode for PT1.9
* Taking the smdataparallel_mnist.py from old commit b87590af021a4c3913b3669fa5f9fc10fbebe4e8
* Minor fix
* Reverting the smdataparallel_smmodelparallel_mnist_script_mode.sh to commit b87590af021a4c3913b3669fa5f9fc10fbebe4e8, link: https://github.com/aws/deep-learning-containers/blame/b87590af021a4c3913b3669fa5f9fc10fbebe4e8/test/sagemaker_tests/pytorch/training/resources/mnist/smdataparallel_smmodelparallel_mnist_script_mode.sh
* Running for release_candidate_integration tests
* Making the scripts backward compatible
* Making script smdataparallel_mnist backward compatible
* Add SpecifierSet instead of versions
* Reverting temp changes
Co-authored-by: Daiming Yang
* [Huggingface-Pytorch][Inference] Fix HF PT 1.9.1 Inference images (#1971)
* HF PT 1.9.1 Inference images
* Fixing urllib3
* Adding libopenblas-dev
* datetime flag as false and run SM tests
* Skipping speech and vision models
* Revert temp changes
* [huggingface_pytorch] Fix HF PT1.9 pipeline sm local test failures (#1978)
* Fix HF PT1.9 pipeline sm local test failures
* Add protobuf
* Skip frameworks
* Revert temp changes
* Release huggingface_pytorch 1.9.1 inference images (#1980)
* skip safety failure for TF estimator (#1981)
* [pytorch][graviton][build] Upgrade torchserve to 0.5.3 (#1982)
* [pt-graviton] Upgrade torchserve to 0.5.3
* Prep for merge
* [Release][PyTorch][Graviton] upgrade torchserve to 0.5.3 (#1989)
* [pt-graviton] Upgrade torchserve to 0.5.3
* Prep for merge
* [Release][Graviton] upgrade torchserve to 0.5.3
* Release TF 2.7 e3 and sagemaker images (#1988)
* Release TF 2.7 e3 and sagemaker images
* Update release_images.yml
* [Habana][Pytorch] Fix Habana SynAI1.2 PT1.10.0 (#1952)
* Fix Habana SynAI1.2 PT1.10.0
* Changed the test ami ids
* Reverting the ami id change as it is not reqd
* update conda test and 1292 cve allow list
* turn on benchmark mode
* fix python version typo
* update build spec
* revert dev config
* update pytorch-lightning
* revert dev_config
Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com>
Co-authored-by: Qingzi-Lan
* [Tensorflow-Habana][Build][Test] Fix Habana SynAI1.2 TF2.7 images (#1950)
* Fix Habana SynAI1.2 TF2.7 images
* Changing the Habana Ami Ids
* Reverting the ami id change as it is not reqd
* update 1292 cve allow list
* turn on benchmark mode
* using Habana base v1.2 ami to rerun the SM test
* update buildspec name
* revert dev config
* using synAI1.3 AMI
* revert dev config
* revert buildspec
Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com>
Co-authored-by: Qingzi-Lan
* [Pytorch][test] Fixing Pt-1.9 SM failures (#1976)
* Fixing Pt-1.9 SM failures
* Change file to the older commit (https://github.com/aws/deep-learning-containers/blob/b87590af021a4c3913b3669fa5f9fc10fbebe4e8/test/sagemaker_tests/pytorch/training/resources/smdataparallel/smdataparallel_throughput.py)
* Adding version split based on SMDDP team's suggestion to run smdataparallel_throughput_post_ptbackend
* Removed incorrect line to test normal working
* Revert temp changes
* [autogluon][build] AutoGluon 0.4.2 container (#1992)
* [tensorflow][build][test] Fix TF 2.6 test errors (#1960)
* [tensorflow] [test] Fix TF 2.6 dependency check
* Build
* Fix
* Disable builds and test only CPU Training
* Build and test only training, fix dataservice
* Run only inference
* Install protobuf on client during inference test
* Fix GPU dockerfile
* Add Training DLCs back to buildspec
* Add 48551 to safety check allowlist
* Revert config and buildspec temp changes
* [Pytorch][Release] Release PT1.9.1 training images (#1993)
* Release PT1.9.1 training images
* Add space
* [pytorch][graviton][test][build] fix pt-graviton conda build failure (#1991)
* [pt-graviton] Upgrade torchserve to 0.5.3
* Prep for merge
* [Release][Graviton] upgrade torchserve to 0.5.3
* find out why pt-graviton release will not build
* changing mamba version to 4.11 for test
* going to 4.12.2 for Mamba
* mamba version to 4.12.0-2
* running without updating conda itself
* move update to after installs keeping conda 4.12
* prep for merge and RC
* update comment on conda update move
* [release] Add AG 0.4.2 to release_images (#1994)
* [release] Add TF 2.6 to release_images (#1995)
* [Pytorch][Inference] Fix PT 1.11 Inference Images (#1987)
* Change buildspec to include inference only
* Upgrade mamba version
* downgrade mamba version
* Fetch ami dynamically
* Upgrade mamba version
* Fix error
* Fix errors in cu113
* Changes from review
* Revert Changes and add comments
* Add standard labels to DLCs (#1896)
* [Autogluon][doc] AutoGluon 0.4.2 available images update (#1997)
* [tensorflow]|[build]|[sagemaker] Add Sagemaker builds for Tensorflow 2.9 (#1858)
* release(tf): Add E3 dockerfiles for TF2.9
* fix: Drop experimental from tf.keras.mixed_precision APIs in tests
* fix: more fixes for the dropped experimental APIs
* update: Use rc1 binaries
* temp: update skip_frameworks for tests
* feat: a…
* [HC Support] Re-release TF 2.9 SM DLCs (#397)
* TF 2.9 re-release for HC
* install sagemaker and smtf before they are uninstalled
* install custom pysdk
* set sagemaker model
* build only tensorflow
* install with deps instead
* fix: bump cpu image size
* Use latest packages
* update cpu artifacts as well
* tests: mark binary_visibility as xfail
* Fix order of install
* run efa
* run rc
* fix quickcheck
* run rc
* run standard
* run efa
* run benchmark test
* fix bug in sm local test
* Update pysdk path
* fix s3 path for pysdk
* set datetime_tag to false
* address quick check
* tests: add tf hc tests
* revert temp change
* fix hc tests
* disable benchmark mode
* disable new build
* add py39 support to pyversion in conftest
* fix typo
* fix safety
* remove skip for PR context
* run efa test
* revert toml file
Co-authored-by: Nishanth Hegde
Co-authored-by: tejaschumbalkar
* [HC support] Re-release TF 2.8 DLCs (#400)
* common changes to test suite
* buildspec and dockerfile changes
* toml changes
* update branch
* adjust buildspec and image size
* run efa
* run benchmark
* run rc
* update test_pre_release with master
* Revert toml file
Co-authored-by: Satish Pasumarthi
Co-authored-by: tejaschumbalkar
* [HC Support] Re-release PT 1.11 SM DLCs (#395)
* PT 1.11 SM binaries
* remove example image
* install custom pysdk
* set sagemaker model
* build only pytorch
* pin protobuf and install binaries with deps
* Use latest packages to address local mode issues
* update cpu artifacts as well
* mark binary_visibility as xfail
* pin protobuf and install binaries with deps in gpu file
* Fix order of install
* run efa
* run rc
* fix quickcheck
* Add tests for HC
* run standard test
* fix formatting
* update pysdk path
* address quick check
* skip fastai test
* update s3 bucket
* revert temp change
* disable builds
* run benchmark test
* run efa test
* fix safety
* remove instance_type when instance_groups is specified
* run standard test
* use hc prefix for sm job names and revert dlc major version
* rebase
Co-authored-by: tejaschumbalkar
Co-authored-by: Nishanth Hegde
Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com>
* allowlist CVE-2022-2068 (#404)
* [HC Support] Re-release TF 2.7 SM DLCs (#402)
* HC re-release for TF2.7
* fix buildspec
* fix buildspec
* run efa
* run benchmark test
* run rc
* update test_pre_release with master
* revert developer toml
Co-authored-by: tejaschumbalkar
Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com>
* [HC support] Re-release PT 1.10 SM DLCs (#399)
* feat: heterogeneous cluster support for PT 1.10
* fix: version in buildspec
* fix: toml file
* fix: buildspec
* run efa + standard
* reformat buildspec files
* fix buildspec
* revert buildspec as e3 images need to be built for base
* run benchmark test
* run rc
* fix build
* revert toml and multi region changes from tejas
* revert trcomp
Co-authored-by: tejaschumbalkar
Co-authored-by: Satish Pasumarthi
* [HC Support] Re-release TF2.6 SM DLCs (#403)
* HC re-release for TF2.6
* Fix buildspec
* fix buildspec and image size
* run efa
* run benchmark
* run rc
* update test_pre_release with master
* Fix buildspec
* build and test
* parametrize smdataparallel tests to include p4d
* pin compatible version of numpy
* fix parameterized hc test
* fix multi region
* run benchmark
* fix benchmark
* sagemaker local test region change
* sagemaker local test fix
* disable benchmark
* run benchmark
* fix pip checks and modelparallel failure
* build image
* skip pip failure
* run sanity
* disable benchmark
* revert trcomp
* build image
* remove force install
* revert unneeded change
* revert dev config
* build example images for gpu
* Add hc prefix for sm jobs to distinguish
Co-authored-by: tejaschumbalkar
Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com>
Co-authored-by: Nishanth Hegde
* release images for HC support (#408)
* Revert "release images for HC support (#408)" (#409)
This reverts commit fe08eec242c928543440e78a71961071c4e02a69.
* Revert "Revert "release images for HC support (#408)" (#409)" (#410) This reverts commit 70f6a5a1b0b0eefc04344b9682c68ec9564b0865. * revert requirements (#414) * move HC support imports to the respective test (#415) * fix import * fix import * fix HC import for TF SM test (#416) * Test GAMMA release of PT neuron training 1.10.2 (#412) * Revert temporary HC changes and sync public master (#419) * change cudnn version for tf2.8 for compatibility with p2 instances (#1688) * update cudnn version * update buildspec * test on p2 instance * revert dev config Co-authored-by: Qingzi-Lan * Habana release v1.2 images for TF and PT (#1687) * release v1.2 * nit * habana release v1.2 (#1691) * Bump numpy in /test/sagemaker_tests/pytorch/inference (#1679) Bumps [numpy](https://github.com/numpy/numpy) from 1.16.4 to 1.21.0. - [Release notes](https://github.com/numpy/numpy/releases) - [Changelog](https://github.com/numpy/numpy/blob/main/doc/HOWTO_RELEASE.rst.txt) - [Commits](https://github.com/numpy/numpy/compare/v1.16.4...v1.21.0) --- updated-dependencies: - dependency-name: numpy dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [NEURON][BUILD][HF] - move hf neuron dlc to use latest sdk (#1669) * [NEURON][BUILD][HF] - use ubuntu18 (#1700) * use ubuntu18 Signed-off-by: Venky Natham * enable test Signed-off-by: Venky Natham * remove libtinfo6 install as that is specific to u20 Signed-off-by: Venky Natham * Update dlc_developer_config.toml Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [NEURON][BUILD][TF] - Move tf2.5.2 neuron to sdk 1.17.1 (#1696) * [NEURON][BUILD][MX] - Move to neuron sdk1.17.1 (#1698) * [NEURON][BUILD][PT] - Move pt1.10 to neuron sdk1.17.1 (#1699) * [NEURON][BUILD][TF] - Move tf1.15.5 to use neuron sdk 1.17.1 (#1697) * Release neuron sdk 1.17.1 version (#1702) Signed-off-by: Venky Natham * [doc] Update available images for neuron sdk release 1.17.1 (#1703) * Add release images definition for HF PyTorch Neuron (#1694) * [PyTorch E3] PT 1.10.2 DLC release (#1683) * pt1.10.2 * add dgl * update vision binaries * update numpy and pillow versions * fix numpy 1.22.0 installation * update versions for cpu * pin ipython version * fix ipython installation * update dgl pt container tests * config for e3 only * pin numpy version * skip CVE 44463 * fix format * update dev config * update dev config * disable dgl * disable dgl cpu test for eks * revert graviton changes * revert sagemaker wheel * remove pt1.10.0 buildspec * revert dev config * Update dlc_developer_config.toml Co-authored-by: Wei Chu Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * Add tf2.7 training sagemaker dockerfiles (#1628) * add tf2.7 sagemaker dockerfiles * update buidlspec * remove non-compatible python packages * add dependencies for kebros * use manylinux wheels; add sagemaker dockerfiles * update horovod installation env vars * update horovod installation script * use numpy as tensorflow dep * Update buildspec.yml * install boost * increase image size limit * update pillow and add docker lables * use wheels from smdebuggers pipelines * fix sanity test * add labels for tf 2.7 sm cpu * rerun * build+rerun * reinstall horovod cpu * install smdebug directly from tag * fix typo; * Revert "fix typo;" This reverts commit 
* Revert "install smdebug directly from tag"
This reverts commit c51ef6b95b20de6f65397f34a29806ab77c03461.
* Executing safety check in PR
* install smdebug directly from the branch
* bump up tensorflow to 2.7.1
* install higher version of tensorflow-io to avoid overriding tensorflow
* Ignoring a false positive vulnerability
* install tfds
* change pytest commands
* do not install dependencies as they have been installed in the dockerfiles
* add SAGEMAKER_TRAINING_MODULE environment variable
* remove pem file in tmp folder
* update sagemaker-tensorflow
* add smdataparallel
* revert rm /tmp
* remove /tmp/git-secrets
* experiment with an smdebug fix
* Revert "experiment with an smdebug fix"
This reverts commit b19ee8347ed6208ff9c2ac81d489dba785632199.
* skip test_keras_mirrored.py
* fix error in buildspec
* Revert "fix error in buildspec"
This reverts commit b973fa415e324a6f63d1fb816b22848e35600934.
* revert developer_config
* fix buildspec
* fix buildspec
* fix py version
* revert buildspec to mainline
Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
Co-authored-by: tejaschumbalkar
Co-authored-by: Sergey Togulev <34056697+SergTogul@users.noreply.github.com>
Co-authored-by: Sergey Togulev
Co-authored-by: Qingzi-Lan
Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com>
* pt1.10.2 release images (#1706)
* pt1.10.2 release images
* add example
* TF2.8: Clean up dockerfiles, update HVD test (#1693)
* update pt1.10.2 release images (#1707)
* update pt1.10.2 release images
* Update release_images.yml
Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
* [Build][tensorflow] fix TF27 GPU CVE-2022-24407 (#1710)
* test
* update
* update
* update
* should fail
* test cpu and gpu
* update gpu sasl package
* update libsasl manually
* update
* add TF27 release images (#1714)
* [Tensorflow] add comment on py39 installation on TF 2.8 dockerfiles (#1715)
* document TF28 dockerfile
* update
* Release TF2.8 e3 images (#1716)
Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com>
* [Tensorflow][Test][ec2] Fix Habana Tensorflow EC2 tests (#1704)
* Changed dev config to build images
* Added safety check test true
* Changed the build to true in buildspec
* Add logic to upload and read from s3 with a break statement
* Remove break and fix tail bug
* Change loop time and last line of script
* Added modularity
* Removing unwanted logs
* Modifying the while loop to check if the test can end early
* Reformatting the code
* Fixing bugs and refactoring
* Minor fix
* refactored code and added buckets for each account
* Refactored to include the ValueError within execute_async method
* Implemented bucket logic
* Reverting temp changes
Co-authored-by: Shantanu Tripathi
* bug fix (#1717)
* re-release TF27 sagemaker cpu training (#1720)
* [build][pytorch] pt1.10 add openssh support (#1619)
* [tensorflow] Bug fixes to TF2.8 E3 images (#1723)
* [tensorflow] Bug fixes to TF2.8 E3 images
* add sasl install
* upgrade sasl instead of reinstalling
* Revert "upgrade sasl instead of reinstalling"
This reverts commit 51eb07408a404edde16e5bb2ddb3aa3b782a37a7.
* [Habana] [test] [ec2, sagemaker] Fix to skip SM tests for Habana and modify async testing API (#1724)
* Fix to skip SM tests for Habana and modify async testing API
* Added the hang detection window variable
* Revert developer config
Co-authored-by: Shantanu Tripathi
* Move sasl to upgrade instead of install (#1726)
* Add dependabot config file to scan Dockerfiles (#1727)
* Add dependabot config file to scan Dockerfiles
* Update dependabot.yml
* [PyTorch] PyTorch 1.10.2 SageMaker DLC (#1709)
* pt1.10.2 sm dlc
* merge from upstream master
* refactor smdebug installation
* set enable_test_promotion:false for e3
Co-authored-by: Wei Chu
* Configured release_images.yml for TF2.8e3 re-release and PT1.10.2 SM release (#1737)
* Configured release_images.yml for TF2.8e3 re-release
* Update release_images.yml
* Add Pytorch release changes to the yml
Co-authored-by: Shantanu Tripathi
Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
* [build][pytorch] pytorch 1.9 add openssh support (#1621)
* add openssh support
* build training image only
* revert dev config
* update
* update package version
* update
* revert dev config
* [tensorflow] Add dockerfiles for TF2.8 (#1685)
* add sagemaker dockerfiles
* update developer config
* update buildspec
* fix typos
* fix typo for python version
* add smdebug
* add sagemaker-tensorflow
* add smdataparallel
* remove tmp files
* update test config
* remove wrong ldlib path
* update tensorflow-io version
* remove sagemaker-tensorflow until py39 pkg becomes available
* remove sagemaker-tensorflow
* add sagemaker-tensorflow
* install sagemaker-tensorflow from source
* install tfds
* do not install tensorflow-dataset in the tests as it was installed in the image
* set datetime_tag to false
* correct python version
* update buildspec
* pass arguments related to python to e3 and sagemaker stages as env vars
* install smdebug from the tag
* minor update for sagemaker-tensorflow installation
* bug fix
* Changes to config file
* Make fix for cyrus CVE
* Change configs file to disable safety_check_test
* bump up requests
* run benchmark without rebuild
* run sagemaker rc tests
* run efa tests
* uninstall tfds as it is installed in the image already
* run rc tests
* remove unused env vars
* fix license
* update buildspec to build sagemaker images only
* Revert "update buildspec to build sagemaker images only"
This reverts commit 908c89dcec178fe964346516cf12f52b6448868d.
* remove temporary canary * Update dlc_developer_config.toml * [release] Release HF image updates from #1777 (#1836) * add hf tf * add hf pytorch * Enable MXNet builds and fix syntax warning in test_utils (#1838) * Fix syntax warning in test_utils * Enable MXNet builds * more syntax fix * Update test_pre_release.py * [release] Release MXNet 1.9 E3 images (#1842) * [release] Empty release_images.yml * release MX 1.9 containers * Add example images * [test] Fix canary failures for gpu images (#1837) * Fix canary failures for gpu images * Change the logic and run the tests * Remove image digest * Using pre-built functions * Revert temp changes * [huggingface_pytorch] Update MMS version for PT1.9 to enable logs again (#1823) * update mms version * update mms version * Update dlc_developer_config.toml * run rc tests * Update dlc_developer_config.toml Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [test] Fix sorted package order for apt-upgrade-list (#1854) * Fix package order for apt-list * Add temp config file * Revert temp changes * [build] Rectify apt errors arising due to nvidia key rotation (#1855) * Rectify apt errors arising due to nvidia key rotation * getting new key * Minor fix * Another fix from the nvidia forums * Fetch keys * Revert temp changes * [pytorch][build][SM] Update PyTorch 1.11.0 cuda 11.3 Dockerfiles (#1799) * change GPU train cuda 115 to 113 * update buildspec to build e3 gpu train only * edit dockerfile path to cu113 * update inference GPU from cu115 to cu113 * update buildspec to build all e3 images with cu11.3 * save cu115 for reference purpose * update PyTorch installation lines to reduce image size * update package versions on GPU images * build E3GPU image only * update OFI version from 1.1.3-aws to 1.2.0-aws * update buildspec to build all e3 images * update EFA to 1.15.1 to upgrade libfabric * update EFA installation strategy for dependencies * fix openmpi path as #1787 * fix sagemaker section buildspec and gpu train * build all images * un-pin libcudnn8 as it's backward compatible * fix mpi command in sagemaker image * add apt-get clean and rm apt/lists/* for all apt-get commands * udpate SM gpu binary link * save buildspec for pt1.11.0-cu115 * install smdebug from source for 1.0.15 * remove wrong pytorch installation command in cu115 sm * update image baseline sizes in buildspec * install smdebug 1.0.15 from source in sm cpu image * disable tests for e3 images * update sm cpu binary link * Revision A: sagemaker standard + sagemaker local * disable datatime_tag * specify cudnn version and not apt-get upgrade for EFA * do_build = false * update DGL sagemaker gcn.py * skip all frameworks except pytorch * skip fastai tests for PT1.11 * Revision: sagemaker standard * Revision: sagemaker rc * Revision: sagemaker efa * update smd data parallel binary url * update SMD binary links format * update developer config, revision A: sagemaker standard * revision B: sagemaker rc * revision C: sagemaker efa * revision: sagemaker rc + local * revision: sagemaker rc + local; do_build=false * revision: sagemaker standard; do_build=false * revision: sagemaker efa; do_build=false * do_build inference images * update torchserve to 0.5.3 * update to install torchserve and torch-model-archiver nightly 2022.3.23 * update python version to 3.8.13, to resolve CVE issue * build all sagemaker images * enable datetime_tag; sagemaker standard test * remove base_iamge_name for sagemaker images * rename telemetry env test from training to inference * do_build=false * 
Install smdebug 1.0.16 from source * install torch-model-archiver stable instead of nightly * do_build=true and sagemaker rc tests * do_build = false * revision: sagemaker standard * revision: sagemaker rc * revision: sagemaker efa + local * datetime_tag=false; do_build=true * revision: sagemaker standard * revision: sagemaker rc * do_build=false * revision: sagemaker efa + local * update smdmp binary url * do_build=true; revision: sagemaker standard * do_build=false; revision: sagemaker rc + local * revision: sagemaker efa * update horovod to version 0.24.3 * do_buld=true; revision: sagemaker standard * do_build=false; revision: sagemaker rc * revision: sagemaker efa + local * update PT 1.11 cu115 smd binary arg format * update PT_E3_INFERENCE_URL to PT_INFERENCE_URL * sagemaker standard tests * update sagemaker dgl test * update sagemaker dgl test * update sagemaker test_dgl to use gcn.py * update gpu arg parse for sagemaker dgl gcn test * update SMDATAPARALLEL_BINARY to SMD_DATA_PARALLEL_URL * do_build=true; sagemaker standard * do_build=false; sagemaker rc * update dlc_major_version to 2 * update comment on smdebug source install * do_build=true: sagemaker standard * Enable ec2 tests for SM since cuda version differs for e3 and sm images * Only cu113 e3 images should have v2 * Change major versions in training dockerfile cu113 * Fix Nvidia key rotaion errors * Fxing for inference images * Revert temp changes * Changed keys to Ubuntu20.04 * Implement another NVIDIA key roation solution * Revert temp changes Co-authored-by: Zeeshan Ashraf Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: Shantanu Tripathi * [test] Fix locally missing images in canary (#1856) * Fix locally missing images in canary * Disable image deletion * Revert temp changes * Release PT1.11 SM CPU/GPU TRAIN/INF images (#1860) * Modified available images to add PT1.11 SM images (#1861) * empty release images (#1862) * [release] Add TF 2.8 inference DLCs to release (#1864) * [NEURON][BUILD] - PT1.10, TF2.5 to sdk1.19.0 and mx1.8 to sdk1.18.0 (#1859) * PT1.10, TF2.5 to sdk1.19.0 and mx1.8 to sdk1.18.0 Signed-off-by: Venky Natham * enable test Signed-off-by: Venky Natham * Add missing files Signed-off-by: Venky Natham * Change mxnet wheel Signed-off-by: Venky Natham * Use the same version of sagemaker-pytorch-inference With sdk1.17.1 dockerfile the version of sagemaker-pytorch-inference is 2.0.8. The latest 2.0.10 for some reason throws up following error InvalidModelException: Model version is not defined. 
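On the sagemaker-pytorch-inference note above: since 2.0.10 raised InvalidModelException ("Model version is not defined") while 2.0.8 worked with the sdk1.17.1 dockerfile, a pin like that is worth asserting so it does not silently drift. A minimal sketch of such a check, with a placeholder image tag (the repo's real sanity tests are structured differently):

    import subprocess

    IMAGE = "pytorch-inference-neuron:placeholder"  # placeholder tag, not a real DLC URI
    PINNED = "2.0.8"  # the version kept to avoid the 2.0.10 InvalidModelException

    def pip_package_version(image: str, package: str) -> str:
        """Read a package version inside the image via `pip show`."""
        out = subprocess.run(
            ["docker", "run", "--rm", "--entrypoint", "pip", image, "show", package],
            check=True, capture_output=True, text=True,
        ).stdout
        for line in out.splitlines():
            if line.startswith("Version:"):
                return line.split(":", 1)[1].strip()
        raise RuntimeError(f"{package} is not installed in {image}")

    def test_sagemaker_pytorch_inference_pin():
        assert pip_package_version(IMAGE, "sagemaker-pytorch-inference") == PINNED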
Signed-off-by: Venky Natham * Fix some test Signed-off-by: Venky Natham * Bump the dlc major version Signed-off-by: Venky Natham * Use mamba instead of mini conda Signed-off-by: Venky Natham * add tf 1.15 Signed-off-by: Venky Natham * add torchserve vuln Signed-off-by: Venky Natham * add torchserve vuln to safety check Signed-off-by: Venky Natham * get_dockerfile_path_for_image add neuron sdk version Signed-off-by: Venky Natham * fix the get_dockerfile_path_for_image Signed-off-by: Venky Natham * fix the ecr scan Signed-off-by: Venky Natham * fix the scan Signed-off-by: Venky Natham * Take care of review comments Signed-off-by: Venky Natham * change python version Signed-off-by: Venky Natham * move back to python 3.7.10 Signed-off-by: Venky Natham * revert config changes Signed-off-by: Venky Natham * [NEURON][TEST][MXNET] - Update the allowlist for the test to pass (#1871) * PT1.10, TF2.5 to sdk1.19.0 and mx1.8 to sdk1.18.0 Signed-off-by: Venky Natham * enable test Signed-off-by: Venky Natham * Add missing files Signed-off-by: Venky Natham * Change mxnet wheel Signed-off-by: Venky Natham * Use the same version of sagemaker-pytorch-inference With sdk1.17.1 dockerfile the version of sagemaker-pytorch-inference is 2.0.8. The latest 2.0.10 for some reason throws up following error InvalidModelException: Model version is not defined. Signed-off-by: Venky Natham * Fix some test Signed-off-by: Venky Natham * Bump the dlc major version Signed-off-by: Venky Natham * Use mamba instead of mini conda Signed-off-by: Venky Natham * add tf 1.15 Signed-off-by: Venky Natham * add torchserve vuln Signed-off-by: Venky Natham * add torchserve vuln to safety check Signed-off-by: Venky Natham * get_dockerfile_path_for_image add neuron sdk version Signed-off-by: Venky Natham * fix the get_dockerfile_path_for_image Signed-off-by: Venky Natham * fix the ecr scan Signed-off-by: Venky Natham * fix the scan Signed-off-by: Venky Natham * Take care of review comments Signed-off-by: Venky Natham * change python version Signed-off-by: Venky Natham * move back to python 3.7.10 Signed-off-by: Venky Natham * revert config changes Signed-off-by: Venky Natham * Change the allowlist Signed-off-by: Venky Natham * Release neuron sdk 1.19.0 (#1868) Signed-off-by: Venky Natham * Update autogluon tests to resolve pip check issue (#1872) * Update autogluon 0.4 dockerfiles to resolve pip check issue * update dev config * Update dlc_developer_config.toml * [pytorch][build] Undo NVIDIA GPG Key Fix (#1869) * Undo NVIDIA GPG Key Fix * Revert temp config toml changes Co-authored-by: Sai Parthasarathy Miduthuri Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [test] Skip pip checks on canary (#1881) * Update PT 1.10 E3 and SM images to remove repo.anaconda.com (#1874) Co-authored-by: Kevin Yang Co-authored-by: Sai Parthasarathy Miduthuri * [pytorch][build][test] Fix PT1.10.2 mpi test and CW logs (#1867) Co-authored-by: Sai Parthasarathy Miduthuri * [test] Make the canary image pull logic scalable (#1878) * Make the canary image pull logic scalable * Respond to comments * Check for all python versions of a released image * Respond to comments * Responding to nits * Sort the parsed canary images to prevent diff test failure (#1884) * [NEURON][DOC] - Update available images (#1886) * [test][Canary] Restrict regex based python version extraction for just AutoGluon images (#1885) * [release] Add PT 1.10 E3 and SM DLCs to release (#1887) * [release] Fix release images (#1891) * [pytorch][build][test] update PT 1.11 buildspec 
to prepare for release of E3 images (#1875) * update PT 1.11 buildspec to prepare for release of E3 images * add PT 1.11 to openssl whitelist * Update test/dlc_tests/sanity/test_pre_release.py Co-authored-by: Kevin Yang Co-authored-by: Sai Parthasarathy Miduthuri * [tensorflow]|[build]|[ec2,ecs,eks] Add E3 dockerfiles for TF2.9 (#1828) * release(tf): Add E3 dockerfiles for TF2.9 * fix: Drop experimental from tf.keras.mixed_precision APIs in tests * fix: more fixes for the dropped experimental APIs * update: Use rc1 binaries * temp: update skip_frameworks for tests * update: use rc2 wheels * temp: try bumping cudnn version * update: use GA release wheels * fix: allow TF2.9 cpu/gpu for CVE sanity tests * fix: allow TF2.9 on CVE-2022-1292 * Revert "temp: update skip_frameworks for tests" This reverts commit b75769f92e78a40ecbe5b385eeb6169b19e7d9f8. * update: upgrade external packages * temp: update skip_frameworks for tests This reverts commit 9b7b5b4aa38f703462225434be6d36cd5ddc2573. * Revert "temp: update skip_frameworks for tests" This reverts commit 7e0db5589a65c2cc8865f148457cfe58a2d2a18e. Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * update huggingface PT 1.10 to remove repo.anaconda.com (#1879) * update huggingface PT 1.10 to remove repo.anaconda.com * remove ruamel_yaml from packages installed * Add ARG MAMBA_VERSION in cu113 Docker file * Revert dlc_developer_config file Co-authored-by: Kevin Yang Co-authored-by: RadhikaB-97 Co-authored-by: Radhika Bhat <78102284+RadhikaB-97@users.noreply.github.com> * [release] Release configuration for TF2.9 E3 Training DLC (#1903) * tf2.9 e3 training config * add rest of configs to template * set disable_sm_tag to false * Add TF2.9 E3 training images to available_images.md (#1906) * [tensorflow] [sagemaker] Add nginx timeout (#1893) Co-authored-by: Sai Parthasarathy Miduthuri * [pytorch][build] Upgrade EFA to 1.15.2 on PT 1.10 Training DLC (#1908) * Upgrade TS version to v0.6.0 (#1898) Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: Sai Parthasarathy Miduthuri * Release configuration for HF PT1.10 (#1909) * Release config changes HT PT1.10 training (#1910) * [release] Release PT 1.10.2 DLCs (#1929) * [pytorch][graviton][canary] Fix graviton canary DLC pull test (#1928) * Update pytorch-graviton pillow to 9.0.1 * Test Pytorch Graviton Pillow 9.0.1 docker update * get only graviton images for graviton ARCH * Typo on 948 __init__.py * test_pre_release.py * created better logic for graviton canary images * another parse_canary_images refactor for graviton * re-enable src module. 
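For context on the "Drop experimental from tf.keras.mixed_precision APIs in tests" item in the TF2.9 work above: the tests had to move from the old experimental namespace to the stable Keras mixed-precision API. A short sketch of the stable calls, assuming a TF 2.9 environment:

    import tensorflow as tf

    # Older tests called tf.keras.mixed_precision.experimental.set_policy(...),
    # which the TF2.9 work above had to drop; the stable replacement is set_global_policy.
    tf.keras.mixed_precision.set_global_policy("mixed_float16")

    policy = tf.keras.mixed_precision.global_policy()
    print(policy.name)            # "mixed_float16"
    print(policy.compute_dtype)   # "float16"
    print(policy.variable_dtype)  # "float32"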
* locking captum to 0.4.1 * reverting version on capton module * prep for merge * renable tests for PR verification * removing unused graviton_tag * put toml back to normal * [test] Disable dependency checks from canaries (#1873) * fix post processing for TF benchmark (#1905) * fix benchmark * revert dev config * [release] Add TF 2.8 DLCs to release images (#1935) * [tensorflow][build][e3] Bump TF version to 2.9.1 (#1930) * update: bump tf to 2.9.1 * fix: version number in buildspec Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * [release] TF 2.9.1 release configuration (#1936) * TF 2.9.1 release configuration * add line * [pytorch][graviton][test] remove anaconda and fix TS 0.5.2 test timeout (#1932) * Release Pytorch-Graviton-E3 Image (#1940) * Update pytorch-graviton pillow to 9.0.1 * Test Pytorch Graviton Pillow 9.0.1 docker update * get only graviton images for graviton ARCH * Typo on 948 __init__.py * test_pre_release.py * created better logic for graviton canary images * another parse_canary_images refactor for graviton * re-enable src module. * locking captum to 0.4.1 * reverting version on capton module * prep for merge * renable tests for PR verification * removing unused graviton_tag * put toml back to normal * use conda-forge instead of anaconda. pillow update * removing local test paths * pytorch CPU into allow_openssl_cve_2022_1292 * fixed missing comma in dict * adding opencv to pip for 0.5.2 torchserve * prep for merge * updating pip pin for torchvision * test torchvision pip update * update toml for merge * release pytorch-graviton * [test] Fix dependency check test for AG 0.3 DLCs (#1945) * [test] Fix dependency check test for AG 0.3 DLCs * Correction: * Revert temp changes * [autogluon] [test] Fix AG 0.4 dependency check test (#1957) * [autogluon] Fix protobuf version issue (#1961) * update conda test to exclude .github directory (#1963) * [release] Add AG 0.4 to release spec (#1943) * [mxnet] [build] Get mxnet docker image building. 
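Regarding the "use conda-forge instead of anaconda" change in the Graviton work above: a check along these lines can confirm that an image's conda configuration only points at conda-forge. This is just an illustrative sketch with a placeholder tag, and it assumes channels are read via `conda config --show channels`; the repo's own anaconda sanity test works differently.

    import subprocess

    IMAGE = "pytorch-inference-graviton:placeholder"  # placeholder tag, not a real DLC URI

    def conda_channels(image: str) -> list[str]:
        """List the conda channels configured inside the image."""
        out = subprocess.run(
            ["docker", "run", "--rm", "--entrypoint", "conda",
             image, "config", "--show", "channels"],
            check=True, capture_output=True, text=True,
        ).stdout
        # Expected output shape:
        #   channels:
        #     - conda-forge
        return [line.strip().lstrip("- ") for line in out.splitlines()
                if line.strip().startswith("-")]

    def test_only_conda_forge_channel():
        channels = conda_channels(IMAGE)
        assert channels == ["conda-forge"], f"unexpected channels: {channels}"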
(#1956) * Fix inference and security test for TF graviton (#1964) * build graviton tf image * install protobuf * adjust condition, run black * revert dev config * [release] Add AG 0.3.2 to release images (#1966) * [test] Disable anaconda test (#1965) * [mxnet] [build] Fix mxnet gpu inference build (#1967) * TF2.7 Graviton release (#1969) * [release] Add MX 1.9 to release images (#1970) * [test] Fix test failures on Neuron SDK 1.17.1 DLCs (#1959) Co-authored-by: Venky Natham * [TF][NEURON][BUILD] - For sdk 1.17.1 release use the right tf 2.5.2 version in tag (#1973) * Fix sanity test for TF2.7 (#1955) * Fix sanity test for TF2.7 * fix 2.7.0 inference * revert change * install protobuf * allowlist dep check * fix nvidia env variable * build inference image only * build and test training * revert temp configs * [release] Add Neuron SDK 1.17.1 DLCs to release images (#1972) * [release] Remove released images from release_images (#1974) * [Pytorch][Build][Test] Fix PT1.9.1 training vuls (#1948) * Fix PT1.9.1 training vuls * Make safety fixes * Add pip numpy version >=1.22.2 * Added the attrs=20.3.0 version required by sagemaker 2.92.1 * Uninstall and install attrs * Downgrade sm pysdk * Pin pillow>=9.1.1 * Fix Protobuf * Revert numpy pinning * Change Conda numpy versioning * Numba pin to 0.55.2 * Using numpy==1.19.1 * add multiprocessing start method to spawn * Using numpy-1.22.3 after test fix * datetime tag as false * Added protobuf to prevent sagemaker failures * Preventing new builds, skipping test_smmodelparallel_mnist_multigpu_singlenode for PT1.9 * Taking the smdataparallel_mnist.py from old commit b87590af021a4c3913b3669fa5f9fc10fbebe4e8 * Minor fix * Reverting the smdataparallel_smmodelparallel_mnist_script_mode.sh to commit b87590af021a4c3913b3669fa5f9fc10fbebe4e8, link: https://github.com/aws/deep-learning-containers/blame/b87590af021a4c3913b3669fa5f9fc10fbebe4e8/test/sagemaker_tests/pytorch/training/resources/mnist/smdataparallel_smmodelparallel_mnist_script_mode.sh * Running for release_candidate_integration tests * Making the scripts backward compatible * Making script smdataparallel_mnist backward compatible * Add Speciferset instead of versions * Reverting temp changes Co-authored-by: Daiming Yang * [Huggingface-Pytorch][Inference] Fix HF PT 1.9.1 Inference images (#1971) * HF PT 1.9.1 Inference images * Fixing urllib3 * Adding libopenblas-dev * datetime flag as false and run SM tests * Skipping speech and vision models * Revert temp changes * [huggingface_pytorch] Fix HF PT1.9 pipeline sm local test failures (#1978) * Fix HF PT1.9 pipeline sm local test failures * Add protobuf * Skip frameworks * Revert temp changes * Release huggingface_pytorch 1.9.1 inference images (#1980) * skip safety failure for TF estimator (#1981) * [pytorch][graviton][build] Upgrade torchserve to 0.5.3 (#1982) * [pt-graviton] Upgrade torchserve to 0.5.3 * Prep for merge * [Release][PyTorch][Graviton] upgrade torchserve to 0.5.3 (#1989) * [pt-graviton] Upgrade torchserve to 0.5.3 * Prep for merge * [Release][Graviton] upgrade torchserve to 0.5.3 * Release TF 2.7 e3 and sagemaker images (#1988) * Release TF 2.7 e3 and sagemaker images * Update release_images.yml * [Habana][Pytorch] Fix Habana SynAI1.2 PT1.10.0 (#1952) * Fix Habana SynAI1.2 PT1.10.0 * Changed the test ami ids * Reverting the ami id change as it is not reqd * update conda test and 1292 cve allow list * turn on benchmark mode * fix python version typo * update build spec * revert dev config * update pytorch-lightning * revert dev_config 
Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> Co-authored-by: Qingzi-Lan * [Tensorflow-Habana][Build][Test] Fix Habana SynAI1.2 TF2.7 images (#1950) * Fix Habana SynAI1.2 TF2.7 images * Changing the Habana Ami Ids * Reverting the ami id change as it is not reqd * update 1292 cve allow list * turn on benchmark mode * using Habana base v1.2 ami to rerun the SM test * update buildspec name * revert dev config * using synAI1.3 AMI * revert dev config * revert buildspec Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> Co-authored-by: Qingzi-Lan * [Pytorch][test] Fixing Pt-1.9 SM failures (#1976) * Fixing Pt-19 SM failures * Change file to the older commit(https://github.com/aws/deep-learning-containers/blob/b87590af021a4c3913b3669fa5f9fc10fbebe4e8/test/sagemaker_tests/pytorch/training/resources/smdataparallel/smdataparallel_throughput.py) * Adding version split based on SMDDP team's suggestion to run smdataparallel_throughput_post_ptbackend * Removed incorrect line to test normal working * Revert temp changes * [autogluon][build] AutoGluon 0.4.2 container (#1992) * [tensorflow][build][test] Fix TF 2.6 test errors (#1960) * [tensorflow] [test] Fix TF 2.6 dependency check * Build * Fix * Disable builds and test only CPU Training * Build and test only training, fix dataservice * Run only inference * Install protobuf on client during inference test * Fix GPU dockerfile * Add Training DLCs back to buildspec * Add 48551 to safety check allowlist * Revert config and buildspec temp changes * [Pytorch][Release] Release PT1.9.1 training images (#1993) * Release PT1.9.1 training images * Add space * [pytorch][graviton][test][build] fix pt-graviton conda build failure (#1991) * [pt-graviton] Upgrade torchserve to 0.5.3 * Prep for merge * [Release][Graviton] upgrade torchserve to 0.5.3 * find out why pt-graviton release will not build * changing mamba version to 4.11 for test * going to 4.12.2 for Mamba * mamba version to 4.12.0-2 * running without updating conda itself * move update to after installs keeping conda 4.12 * prep for merge and RC * update comment on conda update move * [release] Add AG 0.4.2 to release_images (#1994) * [release] Add TF 2.6 to release_images (#1995) * [Pytorch][Inference] Fix PT 1.11 Inference Images (#1987) * Change buildspec to include inference only * Upgrade mamba version * downgrade mamba version * Fetch ami dynamically * Upgrade mamba version * Fix error * Fix errors in cu113 * Changes from review * Revert Changes and add comments * Add standard labels to DLCs (#1896) * [Autogluon][doc] AutoGluon 0.4.2 available images update (#1997) * [tensorflow]|[build]|[sagemaker] Add Sagemaker builds for Tensorflow 2.9 (#1858) * release(tf): Add E3 dockerfiles for TF2.9 * fix: Drop experimental from tf.keras.mixed_precision APIs in tests * fix: more fixes for the dropped experimental APIs * update: Use rc1 binaries * temp: update skip_frameworks for tests * feat: add sagemaker features for TF2.9 Dockerfiles * feat: add sm images to buildspec * update: Use newer SM Debug tags * tests: Use "standard" sagemaker tests * fix: Update some install commands * temp: remove sagemaker-tensorflow-extensions and data parallel libs * fix: replace expired nvidia gpg keys * Update Dockerfile.gpu * fix: update horovod version and tf url * update: Bump EFA and NCCL-OFI version * Revert "fix: replace expired nvidia gpg keys" This reverts commit 28efe73013b5169ee973ebb9eb9b61a087e35804. 
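On the "Fetch ami dynamically" item from the PT 1.11 inference fix (#1987) a few entries above: resolving the newest matching AMI at test time avoids hard-coded ami-ids going stale. A rough sketch with boto3; the name filter here is a placeholder, not the filter the repo actually uses.

    import boto3

    def latest_ami_id(region: str = "us-west-2",
                      name_pattern: str = "Deep Learning AMI (Ubuntu 18.04) Version *") -> str:
        """Return the newest available AMI whose name matches the given pattern."""
        ec2 = boto3.client("ec2", region_name=region)
        images = ec2.describe_images(
            Owners=["amazon"],
            Filters=[
                {"Name": "name", "Values": [name_pattern]},
                {"Name": "state", "Values": ["available"]},
            ],
        )["Images"]
        if not images:
            raise RuntimeError(f"no AMI matched {name_pattern!r} in {region}")
        return max(images, key=lambda img: img["CreationDate"])["ImageId"]

    if __name__ == "__main__":
        print(latest_ami_id())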
* update: smdebug repo tag * update: use rc2 wheels * fix: add smdataparallel ld lib path * temp: try bumping cudnn version * feat: add smdataparallel * update: Add TF model garden and tf-text * update: adding exploratory tests for SM Training Compiler integration * fix: syntax error in trcomp test * fix: syntax error in trcomp testeiifccrchutljrvcrulhjugfttdggrejifitdtttfbig * update: SM Dockerfile to use released version of tf-models instead of nightly * temp: Revert smtrcomp test commits This reverts commit a50936d0d7064fa99b3127ba8e6a7606845f188a. * update: use GA release wheels * Revert "temp: Revert smtrcomp test commits" This reverts commit 42992a2f385fa7469b6192f846953ae3333d9716. * Fix: fixing trcomp tests * Fix: fixing missing imports in trcomp tests and adding skip markers as appropriate * update: temporarily removing tf-model and tf-text to debug telemetry issues * fix: for https://github.com/tensorflow/models/issues/9267 * fix: Typo in Dockerfile * fix: syntax error in trcomp tests * fix: fix interactive commands in Dockerfile * update: increasing image size limit * fix: fixing skip errors in trcomp tests * fix: fixing trcomp integration tests * fix: fixing trcomp integration tests * fix: fixing trcomp integration tests * Fix: fixing trcomp integ tests * fix: install sm_tf with abi=1 from test repo * fix: remove unnecessary git checkout * feat: add sagemaker-tensorflow from source * update: use 2.9.1 wheels * temp: revert back to test fork for pipemode * marking broken trcomp tests as xfail * marking horovod and SMDP + trcomp tests as xfail * Revert "temp: revert back to test fork for pipemode" This reverts commit 54fd4fa3540369de452204c3f5417fd09e55688c. * fix: rectify xfail marker * fix: add reason kwarg to xfail annotations * run SM benchmark test * Adding more comments for readability * Adding new performance benchmark for trcomp * Skipping older xla tests in favor of trcomp tests. Fixing new trcomp tests * Fixing trcomp benchmarks * Skipping trcomp benchmarks for cpu * no-op commit to trigger re-build and tests * tests: run rc tests * tests: run efa tests * tests: disable benchmarks * tests: run rc tests again * tests: run local tests * fix: pin protobuf to fix sanity tests * fix: add tf models pip check exception to allowlist * fix: pin sagemaker and sagemaker tt versions * tests: run sm standard tests with latest changes * fix: typo in documentation * Update dlc_developer_config.toml * fix: reverting unused changes * fix: logic error in version comparison in smtrcomp tests * update: maks TF_URL common param * revert previous commit and just replace the new wheel file This reverts commit fe1c3818d2b2c3f9ca141b9425ccb140f43365e3. 
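The "marking broken trcomp tests as xfail" and "add reason kwarg to xfail annotations" items above boil down to the standard pytest pattern of annotating known failures instead of deleting the tests. A small sketch; the helper it calls is hypothetical and only stands in for the real SageMaker job launch.

    import pytest

    def launch_trcomp_horovod_job():
        # Hypothetical stand-in for the real SM Training Compiler + Horovod job launcher.
        raise NotImplementedError("placeholder: this is the known-broken combination")

    # The known-broken combination stays visible in the report as xfail, with an explicit reason.
    @pytest.mark.xfail(reason="Horovod with SM Training Compiler is currently broken")
    def test_trcomp_with_horovod():
        launch_trcomp_horovod_job()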
* update: sm tf wheel * update: e3 tf url * Promoting variable reuse in trcomp tests * change: removing stale benchmarks * revert tomlfile changes * change: adding more markers to trcomp tests * change: adding more markers to trcomp tests Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Co-authored-by: Loki Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * Fix typo in available_images markdown file (#2005) * [Tensorflow-Habana][Build][Test] Fix Habana SynAI1.2 TF2.7 images (#2003) * init build * update safety ignore ids * revert dev config * Pytorch 1.11 inference buildspec file changes (#1996) * Buildspec file changes * Changes to handle mamba version for training images * Changes to dlc congif file * Uncomment sagemaker inference images in buildspec * Revert changes for training * Revert cpu training docker * Revert change for training gpu * Removing extra RUN to avoid new image layer * Cuda version change for sagemaker inference * Revert cuda version change * Check for errors * Removing the pinned version for cu113 * Revert dlc_developer_config changes * [release][habana] release habana synai1.2 PT1.10 and TF2.7 images (#2004) * release habana synai1.2 PT1.0 images * update tensorflow releae images * update Habana SynAI1.2 release pipeline pytorch version (#2007) * release TF2.9 SM training images (#2008) * [build][Tensorflow] HF TF2.6 GPU training init build (#1986) * init build * updte safety scan ids * revert dev config * Release Pt 1.11 inference (#2010) * update available_images.md with TF2.9.1 training (#2009) * update available_images.md with TF2.9.1 trcomp training images (#2011) Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * [build][Tensorflow] Hf TF26 cpu gpu inference (#1985) * init build * build HF TF2.6 inference * update ullib package * revert dev config * [tensorflow] [build] Fix TF 2.5 docker image build. (#1958) * Update to manually pull in debs from old repo. * Temp toml changes. * Force TF 2.5 builds * Fix dependency check * Fix safety check and pip check issues * Add exception for TF 2.5 * Pin protobuf to fix horovod compilation. * Pin protobuf installation. * Fix urllib3 version issue. * Bump requests package. * Align how horovod is installed with other versions * Modify NVIDIA_REQUIRE_CUDA to get SM Endpoints working. * Bring NVIDIA_REQUIRE_CUDA change to diy file. * Install protobuf in the host instance for the dataservice test and the inference test. * Fix duplicate key. * Add tensorflow-datasets to dockerfile * Bring in 48551 to safety check allowlist. * update tf2.5.3 wheels * add arch_type * remove openssl installed from source on cpu image * change ami id * revert temp changes * change inf instance type * build inference only * fix nit issue * revert temp change * add configs back * build cpu training image * run sm * revert temp change Co-authored-by: Sai Parthasarathy Miduthuri Co-authored-by: Wei Chu Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * [pytorch] [test] [sagemaker] Fix failing test by setting mp start method. 
(#1941) * release TF2.5.3 training and TF2.5.1 inference (#2016) * [HF][build][test] add Huggingface Training compiler infra (#1857) * Merge from PRIVATE @fe8cbde * set dev config * dummy commit * revert dummy commit * dummy commit * revert dummy commit * remove some change conflicts * fix auto merge error * resolve Nvidia GPG issue * revert dev config * revert dev config * address comments * update dev config * update dev config * update tf public binary * update pt docker file * update tensorflow numpy * fix typo * update * update tf2.6 * add contributor and arch type * revert dev config * revert dev config * address comment * fix merge * set dev config * dummy commit * revert dummy commit * revert dev config * address comments Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * fix indentation (#2020) * [pytorch][build] Upgrade EFA to 1.15.2 on PT 1.11 Training GPU DLC (#1888) * release HF tf2.6 (#2019) * [build] Allow buildspec override from config (#1939) * [pytorch][test][build] Fix for PT 1.10 SM GPU Inference (#1999) * [pt-graviton] Upgrade torchserve to 0.5.3 * Prep for merge * [Release][Graviton] upgrade torchserve to 0.5.3 * find out why pt-graviton release will not build * changing mamba version to 4.11 for test * going to 4.12.2 for Mamba * mamba version to 4.12.0-2 * running without updating conda itself * move update to after installs keeping conda 4.12 * prep for merge and RC * update comment on conda update move * Testing docker cu111 env update * testing SM with EFA * Commenting update and validating before PR review * Prep for PR review * setup buildspec to test pt 1.10 * using PT 1.10 buildspec for test * create Dockerfile.cpu for pt 1.10 for build fail * Revert testing files and prep for review * [Huggingface][Build][Test][Tensorflow] HF Tensorflow 2.5 CPU/GPU inference (#1954) * HF Tensorflow 2.5 GPU * update dev config * update * build inference image first * update urllib3 * build cpu and gpu inference * update urllib3 through pip * update urllib and mms * remove vision_model test as vision support with hf inference toolkit is introduced * update safety_scan ignore ids * fix typo * revert dev config * update buildspec * update TF version to 2.5.3 * dummy commit * revert dummy commit * update urllib3 * revert dev config * [release] Fix release buildspec testing after src config changes (#2024) * [build][pytorch][inference] Update PT 1.11 inference opencv version (#2022) * fix: Logic error in SageMaker Training Compiler tests (#2015) Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * [Huggingface][Build][Test][Tensorflow] update HF tf2.5 images (#1962) * build HF tf2.5 training image * update dev config * resolve Nvidia GPG key issue * update build * revert Nvidia GPG short term solution * update docker files * update dev config * revert dev config * revert dev config * add release_image * [tensorflow][build]: Add TF2.9 Inference Dockerfiles (#2023) * add e3 and sm cpu/gpu dockerfiles * tests: update toml file * tests: update buildspec * run sagemaker test * update: use 2.9 wheels * tests: run rc tests * tests: run efa tests * fix: image baseline * rc tests again * standard tests again * fix: add omp * temp: test potential fix to missing libiomp5.so * run rc tests again * run efa again * update: cpu model server url * Revert "temp: test potential fix to missing libiomp5.so" This reverts commit b6cdf19c4e57565e69a3206d744f6fe2bbd371c6.
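The mp start method fix (#1941) at the top of this stretch follows a common pattern: force the spawn start method before creating worker processes so that CUDA state and test fixtures are not inherited through fork. A standalone sketch of that pattern, not the actual test code:

    import multiprocessing as mp

    def worker(rank: int) -> None:
        # Each spawned worker starts from a fresh interpreter, so nothing leaks in via fork().
        print(f"worker {rank} started with start method {mp.get_start_method()!r}")

    if __name__ == "__main__":
        # force=True keeps the call safe even if a framework already set a start method.
        mp.set_start_method("spawn", force=True)
        procs = [mp.Process(target=worker, args=(rank,)) for rank in range(2)]
        for p in procs:
            p.start()
        for p in procs:
            p.join()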
* Revert "fix: add omp" This reverts commit 0bbddaffa91073d9da9f5b194d7f4cae81d7c4a5. * tests: standard again * trigger build * run benchmark test * revert dev config Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * Fetch ami-id dynamically for tests (#1882) * release TF2.9.1 inference and PT1.11 E3/SM images (#2034) * relesdt TF2.9.1 inference images * release PT1.11 images as well * Fix assert statements in TFS test (#2035) * Fix assert statements in TFS test * Update test_pre_release.py * [build] TF2.9.0 Inference DLC (#2037) * build 2.9.0 images * release 2.9.0 images * update available_images.md with TF2.9.0 inference DLC (#2038) * add 2.9.1 inference * toggle line comment * update 2.9.0 * update sm image * [pytorch] [trcomp] [huggingface] add trcomp 1.9 dockerfile with latest CVE fixes (#2027) * [build][tensorflow]TF 2.6 training for CVE fixes (#2030) * TF 2.6 opencv version update * increase image baseline * Change patch version and build inference * format * skip test for tf2.6.3 inference * uncomment TF2.6 inference * revert temp changes * revert buildspec Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * Release hf pytorch trcomp 1.9 images (#2043) * TF2.6 image baseline increase fix (#2044) * Build PT1.9 inference (#1975) * Build PT1.9 inference * skip torchaudio test for PT1.9 * Revert Conda changes and comment NVIDIA_REQUIRE_CUDA * Added mkl library * Added command to install mkl library * Removed mkl apt install. Added mkl pip install. * skip audio test for 1.9.x; lower bound urllib3 version * skip cpu audio test * Change docker file based on fix for conda-forge * Update Dockerfile.cpu * Update Dockerfile.cpu * Update Dockerfile.gp… * [NEURON][PT][TRAINING] - First version of PT training container (#386) * Update release_images.yml For hf neuron for the time being have disable_sm_tag to True * Update release_images.yml * training DLC's Signed-off-by: Venky Natham * create py3 Signed-off-by: Venky Natham * use python3.6 Signed-off-by: Venky Natham * Add neuron collectives. 
This docker image was tested with multi node all reduce on trn1 instance and works fine Signed-off-by: Venky Natham * Create copies of collectives and nccom lib Signed-off-by: Venky Natham * do neuron test/build Signed-off-by: Venky Natham * Fix the buildspecs for neuron Signed-off-by: Venky Natham * Increase the size Signed-off-by: Venky Natham * revert enable neuron test Signed-off-by: Venky Natham * Add basic ec2 test for training Signed-off-by: Venky Natham * enable neuron test Signed-off-by: Venky Natham * Not build inference image for now Signed-off-by: Venky Natham * Use the right region for launch of trn1 instance Signed-off-by: Venky Natham * Run the training test from default region Signed-off-by: Venky Natham * change sdk version to tmp 2.1.1 Signed-off-by: Venky Natham * Update dlc_developer_config.toml * Update to use the beta repo and also py37 Signed-off-by: Venky Natham * Fix the repo path Signed-off-by: Venky Natham * fix the pip repo path Signed-off-by: Venky Natham * Fix version manifest Signed-off-by: Venky Natham * fix the neuron version manifest Signed-off-by: Venky Natham * Fix the py version in buildspec Signed-off-by: Venky Natham * add ignore some safety checks Signed-off-by: Venky Natham * Remove tf2.6 install Signed-off-by: Venky Natham * Fix the ignore safety check file Signed-off-by: Venky Natham * remove unwanted files Signed-off-by: Venky Natham * fix the safety_scan json file Signed-off-by: Venky Natham * fix test failures Signed-off-by: Venky Natham * add some extra package build Signed-off-by: Venky Natham * Use the right ami for neuron test Signed-off-by: Venky Natham * remove smdebug/smclarify Signed-off-by: Venky Natham * fix the pre release vulnerability Signed-off-by: Venky Natham * remove sagemaker-experiments Signed-off-by: Venky Natham * Update dlc_developer_config.toml * Update test_pytorch_training.py Signed-off-by: Venky Natham Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [NEURON][PT][TRAINING] - Add the CVE to allowlist for neuron (#446) * Update release_images.yml For hf neuron for the time being have disable_sm_tag to True * Update release_images.yml * training DLC's Signed-off-by: Venky Natham * create py3 Signed-off-by: Venky Natham * use python3.6 Signed-off-by: Venky Natham * Add neuron collectives. 
This docker image was tested with multi node all reduce on trn1 instance and works fine Signed-off-by: Venky Natham * Create copies of collectives and nccom lib Signed-off-by: Venky Natham * do neuron test/build Signed-off-by: Venky Natham * Fix the buildspecs for neuron Signed-off-by: Venky Natham * Increase the size Signed-off-by: Venky Natham * revert enable neuron test Signed-off-by: Venky Natham * Add basic ec2 test for training Signed-off-by: Venky Natham * enable neuron test Signed-off-by: Venky Natham * Not build inference image for now Signed-off-by: Venky Natham * Use the right region for launch of trn1 instance Signed-off-by: Venky Natham * Run the training test from default region Signed-off-by: Venky Natham * change sdk version to tmp 2.1.1 Signed-off-by: Venky Natham * Update dlc_developer_config.toml * Update to use the beta repo and also py37 Signed-off-by: Venky Natham * Fix the repo path Signed-off-by: Venky Natham * fix the pip repo path Signed-off-by: Venky Natham * Fix version manifest Signed-off-by: Venky Natham * fix the neuron version manifest Signed-off-by: Venky Natham * Fix the py version in buildspec Signed-off-by: Venky Natham * add ignore some safety checks Signed-off-by: Venky Natham * Remove tf2.6 install Signed-off-by: Venky Natham * Fix the ignore safety check file Signed-off-by: Venky Natham * remove unwanted files Signed-off-by: Venky Natham * fix the safety_scan json file Signed-off-by: Venky Natham * fix test failures Signed-off-by: Venky Natham * add some extra package build Signed-off-by: Venky Natham * Use the right ami for neuron test Signed-off-by: Venky Natham * remove smdebug/smclarify Signed-off-by: Venky Natham * fix the pre release vulnerability Signed-off-by: Venky Natham * remove sagemaker-experiments Signed-off-by: Venky Natham * Update dlc_developer_config.toml * Update test_pytorch_training.py * fix the cve for prerelease Signed-off-by: Venky Natham Signed-off-by: Venky Natham Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * Add fake release for PT 1.9.1 ec2 image (#452) * Add graviton for mock release (#453) (#454) * Update release pipeline for neuron (#462) * Update release pipeline for neuron * Update release_images.yml * Bump tensorflow in /test/sagemaker_tests/huggingface_tensorflow/training (#370) Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.5.3 to 2.7.2. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.5.3...v2.7.2) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> * [NEURON][PT][TRAIN] - Add PT 1.11 and U20 container for training (#467) * Update release_images.yml For hf neuron for the time being have disable_sm_tag to True * Update release_images.yml * training DLC's Signed-off-by: Venky Natham * create py3 Signed-off-by: Venky Natham * use python3.6 Signed-off-by: Venky Natham * Add neuron collectives. 
This docker image was tested with multi node all reduce on trn1 instance and works fine Signed-off-by: Venky Natham * Create copies of collectives and nccom lib Signed-off-by: Venky Natham * do neuron test/build Signed-off-by: Venky Natham * Fix the buildspecs for neuron Signed-off-by: Venky Natham * Increase the size Signed-off-by: Venky Natham * revert enable neuron test Signed-off-by: Venky Natham * Add basic ec2 test for training Signed-off-by: Venky Natham * enable neuron test Signed-off-by: Venky Natham * Not build inference image for now Signed-off-by: Venky Natham * Use the right region for launch of trn1 instance Signed-off-by: Venky Natham * Run the training test from default region Signed-off-by: Venky Natham * change sdk version to tmp 2.1.1 Signed-off-by: Venky Natham * Update dlc_developer_config.toml * Update to use the beta repo and also py37 Signed-off-by: Venky Natham * Fix the repo path Signed-off-by: Venky Natham * fix the pip repo path Signed-off-by: Venky Natham * Fix version manifest Signed-off-by: Venky Natham * fix the neuron version manifest Signed-off-by: Venky Natham * Fix the py version in buildspec Signed-off-by: Venky Natham * add ignore some safety checks Signed-off-by: Venky Natham * Remove tf2.6 install Signed-off-by: Venky Natham * Fix the ignore safety check file Signed-off-by: Venky Natham * remove unwanted files Signed-off-by: Venky Natham * fix the safety_scan json file Signed-off-by: Venky Natham * fix test failures Signed-off-by: Venky Natham * add some extra package build Signed-off-by: Venky Natham * Use the right ami for neuron test Signed-off-by: Venky Natham * remove smdebug/smclarify Signed-off-by: Venky Natham * fix the pre release vulnerability Signed-off-by: Venky Natham * remove sagemaker-experiments Signed-off-by: Venky Natham * Update dlc_developer_config.toml * Update test_pytorch_training.py * fix the cve for prerelease Signed-off-by: Venky Natham * Add 1.11 dockerfile Signed-off-by: Venky Natham * Change test code for pt1.11 Signed-off-by: Venky Natham * Enable neuron test Signed-off-by: Venky Natham * Fix the ld library path Signed-off-by: Venky Natham * Increase image size Signed-off-by: Venky Natham * Increase image size to 10G Signed-off-by: Venky Natham * Use droplet instead of prod Signed-off-by: Venky Natham * use the latest mvp3 rc1 AMI Signed-off-by: Venky Natham * Increase vol size Signed-off-by: Venky Natham * Add pt1.11 neuron for pre release check Signed-off-by: Venky Natham * Fix the safety check Signed-off-by: Venky Natham * Fix the syntax issue in last commit Signed-off-by: Venky Natham * Fix the safety id Signed-off-by: Venky Natham * Use prod trn1s Signed-off-by: Venky Natham * Remove the droplet Signed-off-by: Venky Natham * Enable SM tests Signed-off-by: Venky Natham * Skip sm test on neuron trn1 Signed-off-by: Venky Natham * Update dlc_developer_config.toml * Update dlc_developer_config.toml Signed-off-by: Venky Natham Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * Add support for mainline trn1 instances (#482) * Add cryptography to ignore_ids_list (#483) * Add cryptography to ignore_ids * Update dlc_developer_config * Fix issue * Trigger the sanity tests * Trigger builds * Update buildspec-neuron.yml * Update Dockerfile.neuron * Add numpy to docker file * Revert config file Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * public code sync (#486) * Release huggingface_pytorch 1.9.1 inference images (#1980) * skip safety failure for TF estimator (#1981) * 
[pytorch][graviton][build] Upgrade torchserve to 0.5.3 (#1982) * [pt-graviton] Upgrade torchserve to 0.5.3 * Prep for merge * [Release][PyTorch][Graviton] upgrade torchserve to 0.5.3 (#1989) * [pt-graviton] Upgrade torchserve to 0.5.3 * Prep for merge * [Release][Graviton] upgrade torchserve to 0.5.3 * Release TF 2.7 e3 and sagemaker images (#1988) * Release TF 2.7 e3 and sagemaker images * Update release_images.yml * [Habana][Pytorch] Fix Habana SynAI1.2 PT1.10.0 (#1952) * Fix Habana SynAI1.2 PT1.10.0 * Changed the test ami ids * Reverting the ami id change as it is not reqd * update conda test and 1292 cve allow list * turn on benchmark mode * fix python version typo * update build spec * revert dev config * update pytorch-lightning * revert dev_config Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> Co-authored-by: Qingzi-Lan * [Tensorflow-Habana][Build][Test] Fix Habana SynAI1.2 TF2.7 images (#1950) * Fix Habana SynAI1.2 TF2.7 images * Changing the Habana Ami Ids * Reverting the ami id change as it is not reqd * update 1292 cve allow list * turn on benchmark mode * using Habana base v1.2 ami to rerun the SM test * update buildspec name * revert dev config * using synAI1.3 AMI * revert dev config * revert buildspec Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> Co-authored-by: Qingzi-Lan * [Pytorch][test] Fixing Pt-1.9 SM failures (#1976) * Fixing Pt-19 SM failures * Change file to the older commit(https://github.com/aws/deep-learning-containers/blob/b87590af021a4c3913b3669fa5f9fc10fbebe4e8/test/sagemaker_tests/pytorch/training/resources/smdataparallel/smdataparallel_throughput.py) * Adding version split based on SMDDP team's suggestion to run smdataparallel_throughput_post_ptbackend * Removed incorrect line to test normal working * Revert temp changes * [autogluon][build] AutoGluon 0.4.2 container (#1992) * [tensorflow][build][test] Fix TF 2.6 test errors (#1960) * [tensorflow] [test] Fix TF 2.6 dependency check * Build * Fix * Disable builds and test only CPU Training * Build and test only training, fix dataservice * Run only inference * Install protobuf on client during inference test * Fix GPU dockerfile * Add Training DLCs back to buildspec * Add 48551 to safety check allowlist * Revert config and buildspec temp changes * [Pytorch][Release] Release PT1.9.1 training images (#1993) * Release PT1.9.1 training images * Add space * [pytorch][graviton][test][build] fix pt-graviton conda build failure (#1991) * [pt-graviton] Upgrade torchserve to 0.5.3 * Prep for merge * [Release][Graviton] upgrade torchserve to 0.5.3 * find out why pt-graviton release will not build * changing mamba version to 4.11 for test * going to 4.12.2 for Mamba * mamba version to 4.12.0-2 * running without updating conda itself * move update to after installs keeping conda 4.12 * prep for merge and RC * update comment on conda update move * [release] Add AG 0.4.2 to release_images (#1994) * [release] Add TF 2.6 to release_images (#1995) * [Pytorch][Inference] Fix PT 1.11 Inference Images (#1987) * Change buildspec to include inference only * Upgrade mamba version * downgrade mamba version * Fetch ami dynamically * Upgrade mamba version * Fix error * Fix errors in cu113 * Changes from review * Revert Changes and add comments * Add standard labels to DLCs (#1896) * [Autogluon][doc] AutoGluon 0.4.2 available images update (#1997) * [tensorflow]|[build]|[sagemaker] Add Sagemaker builds for Tensorflow 2.9 (#1858) * release(tf): Add E3 dockerfiles for TF2.9 * fix: Drop 
experimental from tf.keras.mixed_precision APIs in tests * fix: more fixes for the dropped experimental APIs * update: Use rc1 binaries * temp: update skip_frameworks for tests * feat: add sagemaker features for TF2.9 Dockerfiles * feat: add sm images to buildspec * update: Use newer SM Debug tags * tests: Use "standard" sagemaker tests * fix: Update some install commands * temp: remove sagemaker-tensorflow-extensions and data parallel libs * fix: replace expired nvidia gpg keys * Update Dockerfile.gpu * fix: update horovod version and tf url * update: Bump EFA and NCCL-OFI version * Revert "fix: replace expired nvidia gpg keys" This reverts commit 28efe73013b5169ee973ebb9eb9b61a087e35804. * update: smdebug repo tag * update: use rc2 wheels * fix: add smdataparallel ld lib path * temp: try bumping cudnn version * feat: add smdataparallel * update: Add TF model garden and tf-text * update: adding exploratory tests for SM Training Compiler integration * fix: syntax error in trcomp test * fix: syntax error in trcomp testeiifccrchutljrvcrulhjugfttdggrejifitdtttfbig * update: SM Dockerfile to use released version of tf-models instead of nightly * temp: Revert smtrcomp test commits This reverts commit a50936d0d7064fa99b3127ba8e6a7606845f188a. * update: use GA release wheels * Revert "temp: Revert smtrcomp test commits" This reverts commit 42992a2f385fa7469b6192f846953ae3333d9716. * Fix: fixing trcomp tests * Fix: fixing missing imports in trcomp tests and adding skip markers as appropriate * update: temporarily removing tf-model and tf-text to debug telemetry issues * fix: for https://github.com/tensorflow/models/issues/9267 * fix: Typo in Dockerfile * fix: syntax error in trcomp tests * fix: fix interactive commands in Dockerfile * update: increasing image size limit * fix: fixing skip errors in trcomp tests * fix: fixing trcomp integration tests * fix: fixing trcomp integration tests * fix: fixing trcomp integration tests * Fix: fixing trcomp integ tests * fix: install sm_tf with abi=1 from test repo * fix: remove unnecessary git checkout * feat: add sagemaker-tensorflow from source * update: use 2.9.1 wheels * temp: revert back to test fork for pipemode * marking broken trcomp tests as xfail * marking horovod and SMDP + trcomp tests as xfail * Revert "temp: revert back to test fork for pipemode" This reverts commit 54fd4fa3540369de452204c3f5417fd09e55688c. * fix: rectify xfail marker * fix: add reason kwarg to xfail annotations * run SM benchmark test * Adding more comments for readability * Adding new performance benchmark for trcomp * Skipping older xla tests in favor of trcomp tests. Fixing new trcomp tests * Fixing trcomp benchmarks * Skipping trcomp benchmarks for cpu * no-op commit to trigger re-build and tests * tests: run rc tests * tests: run efa tests * tests: disable benchmarks * tests: run rc tests again * tests: run local tests * fix: pin protobuf to fix sanity tests * fix: add tf models pip check exception to allowlist * fix: pin sagemaker and sagemaker tt versions * tests: run sm standard tests with latest changes * fix: typo in documentation * Update dlc_developer_config.toml * fix: reverting unused changes * fix: logic error in version comparison in smtrcomp tests * update: maks TF_URL common param * revert previous commit and just replace the new wheel file This reverts commit fe1c3818d2b2c3f9ca141b9425ccb140f43365e3. 
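The "logic error in version comparison in smtrcomp tests" item above (and the SpecifierSet changes elsewhere in this log) come down to comparing parsed versions rather than strings. A minimal illustration with the packaging library; the ">=2.9" bound is only an example, not the gate the tests actually use.

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    # The classic string-comparison trap: "2.10.0" sorts before "2.9.1" lexicographically.
    assert "2.10.0" < "2.9.1"                    # wrong answer when comparing strings
    assert Version("2.10.0") > Version("2.9.1")  # correct with parsed versions

    def framework_in_range(version: str, spec: str = ">=2.9") -> bool:
        """Gate a test on a framework version range using a specifier set."""
        return Version(version) in SpecifierSet(spec)

    assert framework_in_range("2.9.1")
    assert not framework_in_range("2.6.3")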
* update: sm tf wheel * update: e3 tf url * Promoting variable reuse in trcomp tests * change: removing stale benchmarks * revert tomlfile changes * change: adding more markers to trcomp tests * change: adding more markers to trcomp tests Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Co-authored-by: Loki Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * Fix typo in available_images markdown file (#2005) * [Tensorflow-Habana][Build][Test] Fix Habana SynAI1.2 TF2.7 images (#2003) * init build * update safety ignore ids * revert dev config * Pytorch 1.11 inference buildspec file changes (#1996) * Buildspec file changes * Changes to handle mamba version for training images * Changes to dlc congif file * Uncomment sagemaker inference images in buildspec * Revert changes for training * Revert cpu training docker * Revert change for training gpu * Removing extra RUN to avoid new image layer * Cuda version change for sagemaker inference * Revert cuda version change * Check for errors * Removing the pinned version for cu113 * Revert dlc_developer_config changes * [release][habana] release habana synai1.2 PT1.10 and TF2.7 images (#2004) * release habana synai1.2 PT1.0 images * update tensorflow releae images * update Habana SynAI1.2 release pipeline pytorch version (#2007) * release TF2.9 SM training images (#2008) * [build][Tensorflow] HF TF2.6 GPU training init build (#1986) * init build * updte safety scan ids * revert dev config * Release Pt 1.11 inference (#2010) * update available_images.md with TF2.9.1 training (#2009) * update available_images.md with TF2.9.1 trcomp training images (#2011) Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * [build][Tensorflow] Hf TF26 cpu gpu inference (#1985) * init build * build HF TF2.6 inference * update ullib package * revert dev config * [tensorflow] [build] Fix TF 2.5 docker image build. (#1958) * Update to manually pull in debs from old repo. * Temp toml changes. * Force TF 2.5 builds * Fix dependency check * Fix safety check and pip check issues * Add exception for TF 2.5 * Pin protobuf to fix horovod compilation. * Pin protobuf installation. * Fix urllib3 version issue. * Bump requests package. * Align how horovod is installed with other versions * Modify NVIDIA_REQUIRE_CUDA to get SM Endpoints working. * Bring NVIDIA_REQUIRE_CUDA change to diy file. * Install protobuf in the host instance for the dataservice test and the inference test. * Fix duplicate key. * Add tensorflow-datasets to dockerfile * Bring in 48551 to safety check allowlist. * update tf2.5.3 wheels * add arch_type * remove openssl installed from source on cpu image * change ami id * revert temp changes * change inf instance type * build inference only * fix nit issue * revert temp change * add configs back * build cpu training image * run sm * revert temp change Co-authored-by: Sai Parthasarathy Miduthuri Co-authored-by: Wei Chu Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * [pytorch] [test] [sagemaker] Fix failing test by setting mp start method. 
(#1941) * release TF2.5.3 training and TF2.5.1 inference (#2016) * [HF][build][test] add Huggingface Training compiler infra (#1857) * Merge from PRIVATE @fe8cbde * set dev config * dummy commit * revert dummy commit * dummy commit * revert dummy commit * remove some change conflicts * fix auto merge error * resolve Nvidia GPG issue * revert dev config * revert dev config * address comments * update dev config * udpate dev config * update tf public binary * update pt docker file * update tensorflow numpy * fix typo * update * update tf2.6 * add contributor and arch type * revert dev config * revert dev config * address comment * fix merge * set dev config * dummy commit * revert dummy commit * revert dev config * address comments Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * fix intendation (#2020) * [pytorch][build] Upgrade EFA to 1.15.2 on PT 1.11 Training GPU DLC (#1888) * release HF tf2.6 (#2019) * [build] Allow buildspec override from config (#1939) * [pytorch][test][build] Fix for PT 1.10 SM GPU Inference (#1999) * [pt-graviton] Upgrade torchserve to 0.5.3 * Prep for merge * [Release][Graviton] upgrade torchserve to 0.5.3 * find out why pt-graviton release will not build * changing mamba version to 4.11 for test * going to 4.12.2 for Mamba * mamba version to 4.12.0-2 * running without updating conda itself * move update to after installs keeping conda 4.12 * prep for merge and RC * update comment on conda update move * Testing docker cu111 env update * testing SM with EFA * Commenting update and validating before PR review * Prep for PR review * setup buildspec to test pt 1.10 * using TP 1.10 buildspec for test * create Dockerfile.cpu for pt 1.10 for build fail * Revert testing files and prep for review * [Huggingface][Build][Test][Tensorflow] HF Tensorflow 2.5 CPU/GPU inference (#1954) * HF Tensorflow 2.5 GPU * update dev config * update * build inference image first * update urllib3 * build cpu and gpu inference * update urllib3 through pip * update urllib and mms * remove vision_model test as vision support with hf inference tookit is introduced * update safety_scan ignore ids * fix typo * revert dev config * update buildspec * update TF version to 2.5.3 * dummy commit * revert dummy commit * update urllib3 * revert dev config * [release] Fix release buildspec testing after src config changes (#2024) * [build][pytorch][inference] Update PT 1.11 inference opencv version (#2022) * fix: Logic error in SageMaker Training Compiler tests (#2015) Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * [Huggingface][Build][Test][Tensorflow] update HF tf2.5 images (#1962) * build HF tf2.5 training image * update dev config * resolve Nvidia GPG key issue * update build * revert Nvidia GPG short term solution * update docker files * update dev config * revert dev config * revert dev config * add release_image * [tensorflow][build]: Add TF2.9 Inference Dockerfiles (#2023) * add e3 and sm cpu/gpu dockerfiles * tests: update toml file * tests: update buildspec * run sagemaker test * update: use 2.9 wheels * tests: run rc tests * tests: run efa tests * fix: image baseline * rc tests again * standard tests again * fix: add omp * temp: test potential fix to missing libiomp5.so * run rc tests again * run efa again * update: cpu model server url * Revert "temp: test potential fix to missing libiomp5.so" This reverts commit b6cdf19c4e57565e69a3206d744f6fe2bbd371c6. 
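For the mp start method fix noted above (#1941): CUDA-initialized PyTorch processes generally cannot be forked safely, so tests that launch worker processes typically force the "spawn" start method. The snippet below is only a sketch of that pattern, with a hypothetical worker and process count; the actual test change may differ in detail.

    import torch
    import torch.multiprocessing as mp

    def _worker(rank: int) -> None:
        # Hypothetical per-process work; a real test would exercise the model here.
        print(f"worker {rank} sees {torch.cuda.device_count()} visible GPU(s)")

    if __name__ == "__main__":
        # "fork" is unsafe once CUDA has been initialized in the parent process,
        # so force "spawn" before creating any workers.
        mp.set_start_method("spawn", force=True)
        processes = [mp.Process(target=_worker, args=(i,)) for i in range(2)]
        for p in processes:
            p.start()
        for p in processes:
            p.join()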
* Revert "fix: add omp" This reverts commit 0bbddaffa91073d9da9f5b194d7f4cae81d7c4a5. * tests: standard again * trigger build * run benchmark test * revert dev config Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * Fetch ami-id dynamically for tests (#1882) * release TF2.9.1 inference and PT1.11 E3/SM images (#2034) * relesdt TF2.9.1 inference images * release PT1.11 images as well * Fix assert statements in TFS test (#2035) * Fix assert statements in TFS test * Update test_pre_release.py * [build] TF2.9.0 Inference DLC (#2037) * build 2.9.0 images * release 2.9.0 images * update available_images.md with TF2.9.0 inference DLC (#2038) * add 2.9.1 inference * toggle line comment * update 2.9.0 * update sm image * [pytorch] [trcomp] [huggingface] add trcomp 1.9 dockerfile with latest CVE fixes (#2027) * [build][tensorflow]TF 2.6 training for CVE fixes (#2030) * TF 2.6 opencv version update * increase image baseline * Change patch version and build inference * format * skip test for tf2.6.3 inference * uncomment TF2.6 inference * revert temp changes * revert buildspec Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * Release hf pytorch trcomp 1.9 images (#2043) * TF2.6 image baseline increase fix (#2044) * Build PT1.9 inference (#1975) * Build PT1.9 inference * skip torchaudio test for PT1.9 * Revert Conda changes and comment NVIDIA_REQUIRE_CUDA * Added mkl library * Added command to install mkl library * Removed mkl apt install. Added mkl pip install. * skip audio test for 1.9.x; lower bound urllib3 version * skip cpu audio test * Change docker file based on fix for conda-forge * Update Dockerfile.cpu * Update Dockerfile.cpu * Update Dockerfile.gpu * Revert "Update Dockerfile.gpu" This reverts commit bef27f537220f1fc7224c6668d89b6c9659ba416. * Revert "Update Dockerfile.cpu" This reverts commit 7f08d20ab5a2926a8b3a70f2468d2d9c44358e72. 
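On the TFS assert fixes above (#2035): the inference tests drive TensorFlow Serving through its REST predict endpoint and assert on the parsed JSON response rather than on raw strings. A rough sketch of that shape, assuming a server already listening on localhost:8501 and using an example model name; this is not the repository's exact test code.

    import requests

    def predict(model_name: str, instances: list) -> list:
        # TensorFlow Serving REST API: POST /v1/models/<name>:predict with {"instances": [...]}.
        url = f"http://localhost:8501/v1/models/{model_name}:predict"
        response = requests.post(url, json={"instances": instances}, timeout=30)
        # Assert on status code and payload structure so that formatting differences
        # in the response body cannot break the test.
        assert response.status_code == 200, response.text
        body = response.json()
        assert "predictions" in body and len(body["predictions"]) == len(instances)
        return body["predictions"]

    if __name__ == "__main__":
        print(predict("half_plus_two", [[1.0], [2.0], [5.0]]))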
* Update Dockerfile.cpu * Update Dockerfile.gpu * Update buildspec.yml * Update Dockerfile.gpu * Update Dockerfile.cpu * Update buildspec.yml * Update Dockerfile.cpu * Update buildspec.yml * Update Dockerfile.gpu * Update Dockerfile.cpu * Update buildspec.yml * Update Dockerfile.cpu * Update Dockerfile.neuron * Update PT 1.9 buildspec * revert config * Update Dockerfile.gpu * Update Dockerfile.gpu * Update Dockerfile.cpu Co-authored-by: Shantanu Tripathi Co-authored-by: Kenneth Ezirim Co-authored-by: Wei Chu Co-authored-by: RadhikaB-97 Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * Release PT 1.9 inference images (#2045) * Release PT 1.9 inference images * Update release_images.yml * increase image_baseline (#2046) Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * Release mxnet 1.9 training for cve patching (#2051) * Re-release for mxnet 1.9 training * Add example image configuration * [build][tensorflow]TF 2.7 training for CVE fixes (#2036) * Opencv fix * Update tf version * revert changes * Habana DLC 1.4.1 Release (#1900) * manually apply and check all the 1.4.1 changes * add contributor and arch_type to yml * manually merge from master 2da4856 * rerun * set the loop_time for ec2 to 4 hours * fix ending check * change hang_detection_window * rerun * ignore vulnerabilities caused by pytorch-lightning and pyyaml * Update ignore_ids_safety_scan.json * fix typo * manually remove the ignore of pytorch-lightning vulnerability * add empty lines to files * Remove extra space from buildspec-1-10-syai-1-2.yml * Remove extra spaces * readd pytorch-lightning vulnerability * revert configs for release Co-authored-by: omrialmog Co-authored-by: Shantanu Tripathi * Release autogluon 0.4 (#2054) * Release TF 2.6 (#2055) * Release Habana PT and TF (#2056) * [build][tensorflow]TF 2.8 training for CVE fixes (#2039) * opencv cve fix * Revert changes * remove training images (#2059) * TF2.7 buildspec file change to build training images (#2057) * Check sanity faliures on Pipeline * Build training only * remove TF2.6 example image configuration (#2060) * Release TF2.7 training (#2061) * TF2.7 release * Change customer_type for example * Release Tf2.8 for cve fix (#2062) * Add buildspec-2-6.yml file to HT TF(#2063) * [huggingface_pytorch][NEURON][build] Huggingface Neuron inference DLC Framework update (#1852) * Update buildspec-neuron.yml * new neuron dlc * added dev config * change to neuron sdk 1.18.0 * fixed version * "1.10" * switched to new neuron version * fixed version * Update buildspec-neuron.yml * Update dlc_developer_config.toml * mamba change * Update __init__.py * Update dlc_developer_config.toml * Update dlc_developer_config.toml Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [test] Revert Disable anaconda test (#2042) This reverts commit b0ac1bedc258feea6805a0d5e21ddade723a75cd. 
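On the ignore_ids_safety_scan.json updates referenced above (see also the training-neuron entries added in the diff at the end of this patch): the file maps an image type and Python version to safety vulnerability IDs that are deliberately ignored, each with a justification string. A small sketch of how such an allowlist could be applied to a list of reported IDs; the helper and report format here are illustrative, not the repository's actual tooling.

    import json

    def filter_safety_report(report_ids, allowlist_path, image_type, python_version):
        """Drop vulnerability IDs that are explicitly allowlisted for this image flavor."""
        with open(allowlist_path) as f:
            allowlist = json.load(f)
        ignored = allowlist.get(image_type, {}).get(python_version, {})
        # Keys starting with "_" (e.g. "_comment") are annotations, not vulnerability IDs.
        ignored_ids = {key for key in ignored if not key.startswith("_")}
        return [vid for vid in report_ids if vid not in ignored_ids]

    if __name__ == "__main__":
        remaining = filter_safety_report(
            ["43453", "51457", "99999"],
            "data/ignore_ids_safety_scan.json",
            "training-neuron",
            "py3",
        )
        print(remaining)  # only IDs without an allowlist entry remain, e.g. ["99999"]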
* Add HF PT Neuron 1.10 images (#2067) * [huggingface][tensorflow] Removal of repo.anaconda.com for HF TF 2.6 inference (#1998) * Fix image_baseline_size for HF TF 2.6 (#2072) * [pytorch][build][e3] PyTorch 1.12.0 E3 release (#2000) * create framework test images with vanilla pytorch * remove aws-sdk-cpp installation * update mamb conda installation * create cuda 11.6 image for e3 * update training binaries urls * revert cu116 nccl version to 2.10.3 * initialize inference 1.12 * update inference dockerfiles for pt 1.12 * update nvml version for inference cu116 * install dgl through pip * update 3711 and 1292 allow list * instal dgl-cuda11.6 with conda * update to v1.12.0-rc5 binaries * update to v1.12.0-rc7 * unpin opencv-python version & use 1.12 license file * update testTorchdata files * skip other frameworks * update torchdata script * update torchdata script and add new certificate for aws sdk cpp * update EFA to 1.16.0 * remove unnecessary cu113 files * update to official v1.12.0 release * revert EFA version to 1.15.1 * use EFA 1.16.0 * pin safety < 2.0.0 * revert dev config Co-authored-by: Qingzi-Lan * Add HF PT Neuron 1.10.2 to available images markdown (#2070) * [Pytorch][build] update PT1.12 GPU Training image OFI Nccl from 1.2.0 to 1.3.0 (#2076) * update OFI Nccl from 1.2.0 to 1.3.0 * allow CVE-2022-2068 as it's false positive * dummy commit * revert dummy commit * update CVE all mechnism * remove empty line * revert dev config * Add Habana SynAI1.4.1 to available_images.md (#2074) * [release][pytorch] PT1.12 release_image.yml change (#2047) * PT1.12 release_image.yml change * backup PT1.12 release images * update example image * remove SM release_images * PT112 E3 available_images update (#2048) * Update opencv to 4.6.0 for PT 1.10 graviton (#2066) * update opencv to 4.6.0 for PT 1.10 graviton * remove double opencv installation * change opencv update and fix Graviton DLAMI not being selected for Graviton tests * remove 2nd opencv installation * pin safety version * update openssl to 3.0.4 for CVE * revert openssl change * install net-tools to fix route command not found error * revert toml changes Co-authored-by: Kevin Yang * Add test to block specific releases (#2084) Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * [build][huggingface_tensorflow] HF TF 2.6 cve fix (#2082) * Opencv cve fix for HF TF 2.6 * Trigger builds * Add rdflib in docker files * Specify rdflib version * Adding rdflib cve to allowlist * Revert file changes * Check sanity failures * Revert changes * [tensorflow] [trcomp] [huggingface] add trcomp 2.6 dockerfile with latest CVE fixes (#2028) * TF2_6_CVE * TF Buildspec * Empty-Commit * Remove bokeh installation * Update huggingface/tensorflow/training/docker/2.6/py3/cu112/Dockerfile.trcomp.gpu Co-authored-by: Loki * Update dlc_developer_config.toml * fix CVEs * update CVE allowlist * update CVE list * Update dlc_developer_config.toml Co-authored-by: dasritwi Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Co-authored-by: Loki Co-authored-by: Qingzi-Lan Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * add triton 22.05 container to docs (#2081) * add triton 22.05 container to docs * order images by latest release Co-authored-by: Rohith Nallamaddi Co-authored-by: Jeetendra Patil * add PT 1.10 graviton to release images (#2089) Co-authored-by: Kevin Yang * Security patch for 
Autogluon 0.3.2 and bug fix for test cuda path failures (#2078) * release trcomp TF2.6 (#2101) * release trcomp TF2.6 * add arch type * Re-release of PT1.11 for Sagemaker model parallel v1.10.0 (#2041) * Add HC support to SageMaker DLC (#2099) Co-authored-by: Jeetendra Patil Co-authored-by: Karan Jariwala Co-authored-by: Lai Wei Co-authored-by: Akhil Mehra Co-authored-by: yselivonchyk Co-authored-by: akhilmehra Co-authored-by: Rajan Singh Co-authored-by: omrialmog Co-authored-by: omrialmog Co-authored-by: Alexander Shirkov <10080307+gradientsky@users.noreply.github.com> Co-authored-by: Sergey Togulev <34056697+SergTogul@users.noreply.github.com> Co-authored-by: Sergey Togulev Co-authored-by: Alexander Shirkov Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Co-authored-by: pinaraws <47152339+pinaraws@users.noreply.github.com> Co-authored-by: Sai Parthasarathy Miduthuri Co-authored-by: David Huang Co-authored-by: Shantanu Tripathi Co-authored-by: David Huang Co-authored-by: Loki Co-authored-by: aws-vrnatham <73142315+aws-vrnatham@users.noreply.github.com> Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> Co-authored-by: Qingzi-Lan Co-authored-by: Harish Tummalacherla Co-authored-by: Ubuntu Co-authored-by: Junpu Fan Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Co-authored-by: Venky Natham Co-authored-by: Manu Seth <22492939+mseth10@users.noreply.github.com> Co-authored-by: Wei Chu Co-authored-by: Judy Heflin Co-authored-by: Hongshan Li Co-authored-by: Shantanu Tripathi Co-authored-by: Buke Ao Co-authored-by: Anny Chung Co-authored-by: waytrue17 <52505574+waytrue17@users.noreply.github.com> Co-authored-by: Rahul Huilgol Co-authored-by: Daiming Yang <66369380+ydaiming@users.noreply.github.com> Co-authored-by: lxning <23464292+lxning@users.noreply.github.com> Co-authored-by: kevinyang8 <40340762+kevinyang8@users.noreply.github.com> Co-authored-by: Kevin Yang Co-authored-by: Zhongkai Zhang <74077786+zzhang37@users.noreply.github.com> Co-authored-by: Fei <33940270+YangFei1990@users.noreply.github.com> Co-authored-by: haohanchen-yagao Co-authored-by: Judy Heflin Co-authored-by: Zeeshan Ashraf Co-authored-by: Nishanth Hegde Co-authored-by: RadhikaB-97 Co-authored-by: Radhika Bhat <78102284+RadhikaB-97@users.noreply.github.com> Co-authored-by: matherit <97054789+matherit@users.noreply.github.com> Co-authored-by: Xin Yang <105740670+xyang16@users.noreply.github.com> Co-authored-by: Mike Schneider <104035434+xncqr@users.noreply.github.com> Co-authored-by: Nishanth Hegde Co-authored-by: Daiming Yang Co-authored-by: Satish Pasumarthi <35979860+satishpasumarthi@users.noreply.github.com> Co-authored-by: Satish Pasumarthi Co-authored-by: Kenneth Ezirim Co-authored-by: Buke Ao * [pytorch][build] Rename PT 1.11 buildspec (#2102) * Huggingface sagemaker test fix (#2098) * Huggingface sagemaker test fix * adding specification to empty report generating condition * Sagemaker local test skip revamp * Add TODO to the comment in test_placeholder.py * Update test_placeholder.py * Enabaling sm local tests * Revert changes * Add condition to skip sm-local * Revert changes Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * [pytorch][build][sagemaker] PyTorch 1.12.0 SageMaker release (#2002) * create framework test images with vanilla pytorch * remove 
aws-sdk-cpp installation * update mamb conda installation * build test images for sagemaker * update smdebug_version to 1.0.18 * update training SM binary urls * try build SM iamges only * use latest dgl * update to v1.12.0-rc5 binaries * update v1.12.0-rc7 * update opencv and license * skip other frameworks * update torchdata script and add new certificate for aws sdk cpp * update to v1.12.0 binaries * update inference e3 v1.12.0 wheel * run standard test and disable datetime_tag * update SMD Data Parallel wheel link * do_build=false; sagemaker rc * sagemaker efa + local * skip fastai tests * Revert "update SMD Data Parallel wheel link" This reverts commit 03ab548b7e7b11bc07bd7b5a17d5ceb715a15737. * update SMD Data Parallel wheel link * udpate OFI to 1.3.0-aws * do build; sagemaker standard * sagemaker rc tests * sagemaker efa + local * update PyTorch training binaries without ZCC * temporarily disable Train GPU build, because metis isn't available * remove metis installation and build SageMaker Train GPU * udpate smdebug version to 1.0.19 * build false; sagemaker standard * sagemaker rc * build false; sagemaker standard * Smp pt 1 12 (#63) * SMP pt1.12 * minor fix * Update test/sagemaker_tests/pytorch/training/resources/gpt2/memory_tracker.py Co-authored-by: Daiming Yang <66369380+ydaiming@users.noreply.github.com> * skip tst_sm_profiler_pt test * sagemaker rc * do_build = true; sagemaker standard * update sm aws-pytorch binary * use e3 binary for E3 build in training cpu * build = false; sagemaker rc * sagemaker efa * move net-tools installation to Graviton image only * update binary links * do_build = true; sagemaker standard * update inference GPU binary url * Trigger Build * Trigger Build * Trigger Build * do_build false; sagemaker efa * Trigger Build * Trigger Build * revert dlc_developer_config.toml Co-authored-by: haohanchen-yagao <54413235+haohanchen-yagao@users.noreply.github.com> Co-authored-by: Qingzi-Lan * Re-release HF TF 2.6 (#2104) * Release HF TF 2.6 for opencv cve fix and ananaconda removal * Set example to false * Security Patch for Autogluon 0.3.2 - install OSS compliance package (#2097) * Mdified release images file for autogluon 0.3.2 release (#2108) * [pytorch][build] fix dgl test (#2107) * fix dgl test * update dev config * fix typo * update accuracy threshold * update dev config, build without datetime * aggregate test file, do build = false * remove the new test file * update mxnet test script * revert dev config Co-authored-by: Qingzi-Lan Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> * Bukeao habana dlc 1.5.0 release (#2069) * copy files * build success version * pycocotools, enable habana mode and benchmark, and AMI id * datetime_tag=false * empty commit, rerun * Add backward compatiblity for test_pre_release version check * Enable benchmark testing * fix tfversion for license.txt * add old 1.4 buildspec.yml & add all history DLAMI id * rerun * rerun * update pt version check * only compare major version * revert config toml Co-authored-by: omrialmog Co-authored-by: Omri Almog Co-authored-by: Shantanu Tripathi * allow hpu/openssl for cve 3711 (#2111) Co-authored-by: omrialmog * [pytorch][test] update 0.9.x GCN SM test (#2112) * test with 0.9.x CGN test * update test file for DGL lower than 0.9.x * fix typo * update test script name to match DGL upstream * revert dev config * Re-release PT1.11 and PT1.10 training images with latest sagemaker-pytorch-training toolkit (#2114) * Re-release PT1.11 and PT1.10 training images with latest 
sagemaker-python-training toolkit * Update dlc_developer_config.toml * Update dlc_developer_config.toml * Update buildspec-1-11.yml * Update dlc_developer_config.toml * Update dlc_developer_config.toml * Release Habana Synai1.5.0 images (#2113) * Release Habana TF2.9.1 Synai1.5.0 images * Update release file for pytorch as well * Update HF TrComp PT 1.10 build to prepare for anaconda removal (#2092) * dummy change to trigger builds and tests * remove numpy pin and use latest numpy * pin numpy to 1.22.4 * skip test_binary_visibility for HF TrComp images * update tr comp image function * revert toml file Co-authored-by: Kevin Yang * [pytorch] [test] update testTorchdata bash script (#2115) * update testTorchdata bash script * remove the redundant succeeding message * Remove Pt1.11 CPU GPU from release_images.yml (#2116) * Security Patch of opencv-python for PyTorch 1.10 (#2093) * added opencv-python security patch for pytorch 1.10 * modified the developer config file to skip all frameworks except pytorch * Update Dockerfile.sagemaker.gpu * remove metis for PT1.10 * Check EC2 connection before changing path * Removed the additional print statement * Specify buildspec for pytorch 1.10 using new approach * Fixed error in code with the deletion of exception handler * SMP backward compatibility * add smp version check * minor fix * pin sagemaker training version * Update buildspec-1-10.yml * fix issue for smp init * Restoring dlc developer configurations Co-authored-by: haohanchen-yagao * Added release image template for PT 1.10 (#2119) * Added release image template for PT 1.10 * Update release_images.yml * Update release_images.yml Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * update release image file (#2103) * Add Habana SynAI1.5 images to available_images.yml (#2118) * modifying PT 1.10 buildspec to generate inference image RCs (#2123) * PT112 SM available_image update (#2049) Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * Release PT 1.11 sagemaker training images (#2126) * releasing pytorch 1.10 inference images (#2128) * Huggingface Pytorch 1.10 cve fix (#2125) * Add Rdflib in ignore_ids_safety_scan.json file * Changes to docker file to trigger builds * Revert Changes Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> * Add CGK region to the available_images.md file (#2127) * Release HF PT 1.10.2 inference and training (#2132) * Remove out of support images from available_images.md (#2129) * remove out of support images * remove additional images * remove EI and old habana images * remove EI Co-authored-by: Kevin Yang * Update available images file with anaconda license (#2137) * update release images for Tr Comp (#2138) Co-authored-by: Kevin Yang * [Pytorch][build] Update SMMP and ofi-nccl (#2136) * Update SMMP and ofi-nccl * revert devconfig and buildspec Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> Co-authored-by: Qingzi-Lan * Nightly DLC Test Parametrization (#2090) * [autogluon][build] AutoGluon 0.4.3 container (#2139) * dlc_developer_config.toml * Updated AutoGluon container files - AG 0.4.3 * updated models * Revert "dlc_developer_config.toml" This reverts commit f302739b289349b5a99fc4780c28f51577df489f. 
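Regarding the version checks mentioned above (the smp version check and the "only compare major version" adjustments): gating test behavior on an installed package version is usually done with packaging.version rather than string comparison, since pre-release tags and local suffixes break naive equality checks. A hedged sketch with hypothetical function names and thresholds:

    from packaging.version import Version

    def supports_new_smp_api(installed: str, minimum: str = "1.10.0") -> bool:
        # Full semantic comparison for feature gating.
        return Version(installed) >= Version(minimum)

    def same_major(installed: str, expected: str) -> bool:
        # Looser check for cases where only the major version matters, so that
        # patch releases or local suffixes do not fail the assertion.
        return Version(installed).major == Version(expected).major

    if __name__ == "__main__":
        print(supports_new_smp_api("1.11.1"))    # True
        print(same_major("2.9.1", "2.9.0+cpu"))  # True: both are major version 2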
* updated container sizes Co-authored-by: kevinyang8 <40340762+kevinyang8@users.noreply.github.com> * add AG 0.4.3 to release images (#2146) Co-authored-by: Kevin Yang * update license to include anaconda notice (#2147) Co-authored-by: Kevin Yang * update AG 0.4.2 to AG 0.4.3 in available_images (#2150) Co-authored-by: Kevin Yang * [autogluon][build] AutoGluon 0.5.2 container (#2110) * Updated AutoGluon container files - AG 0.5.1 * dlc_developer_config.toml * Added OpenSSL exceptions * Revert "dlc_developer_config.toml" This reverts commit e3906f525a9bc692b21d480859c22f4b63b2cd5e. * Updated AutoGluon container files - AG 0.5.2 * dlc_developer_config.toml * Updated image sizes * Updated AutoGluon container files - AG 0.5.2 * Revert "dlc_developer_config.toml" This reverts commit 350458dc21963a58cd414b93319ee64a1d1638d7. * Trigger build * add AG 0.5.2 to release images (#2152) Co-authored-by: Kevin Yang * Update available_images.md with AG 0.5.2 (#2153) Co-authored-by: Kevin Yang * [tensorflow][test][sagemaker] Swap order on json_request to prevant errant processing. (#2091) * Swap order on json_request to prevant errant processing. * Temp config change. * Temp change only build inference images. * Toml add sagemaker remote tests. * Fix test. * Revert "Temp change only build inference images." This reverts commit d82e949d0ec09ea958ed2037c0ef11b8050fde31. * Revert "Toml add sagemaker remote tests." This reverts commit a6d51d1c2dba21ecb64f96bac0e7d6df2e5dc5b9. * Revert "Temp config change." This reverts commit acdb3bbb94d654bbef1988a0046ee6f2f420df48. * Manually resolve conflict to align with upstream branch. Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> * Correct nightly fixtures on SM tests (#2164) * Update e3 name to ec2 in dockerfiles and build/test code (#2140) * Update e3 name to ec2 in dockerfiles and build/test code * update file names and comments * Update PT 1.11 dockerfile * address nit comment * Replace banners, replace release images yml * SMP PT1.12 re-release (#2145) * add zero-2d for safety check * add utl * update dev config and allow 48298 * update SMMP 1.11 Binary && Add SMMP GPT2 SDP test * add comment * enable efa test * disable build * update smmp binary * minor fix * add multinode sdp test for smmp * minor fiix * check efa/non-efa test * remove smmp sdp p3 test * nit * enable rc and local test * Add comment for skipped check * pin sagemaker version * disable build * run SM local test with fix, disable sm remote test * revert temporary change * minor fix * revert change in dev config Co-authored-by: Qingzi-Lan Co-authored-by: haohanchen-yagao Co-authored-by: haohanchen-yagao <54413235+haohanchen-yagao@users.noreply.github.com> Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * [pytorch][sagemaker] Add tests for pytorchddp distribution (#2141) * Add remote integration test for pytorchddp distribution * Update dlc_developer_config * Retrigger CI tests with no-op commit * Update buildspec for PT 1.11 * Fix whitespace and args for test * Update buildspec for PT 1.12 * Add efa marker for gpu test * Retrigger CI * skip frameworks other than pytorch for build/test * set datetime_flag to false * set do_build=false * Revert changes to dlc_developer_config.toml * Small fixes * Run tests for 1.11 again * Clean up PR for merge * Revert buildspec.yml changes * Update instance type for CPU test * Fix CPU test * build PT1.11 images without datetime tag * build PT1.12 images 
without datetime tag * Remove CPU test * Revert changes to dlc_developer_config Co-authored-by: tejaschumbalkar Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * disable SM inference build for PT1.12 (#2168) * [pytorch][graviton][build] PT 1.12 graviton release (#2109) * draft PT 1.12 graviton release * turn on graviton mode * update for PT 1.12 specific changes * update buildspec path * update wheels for PT 1.12.1 and torchdata and torchtext * update some package versions * update package versions & gcc 10 support * Change Dockerfile name from e3 to ec2 * Change e3 to ec2 in dockerfile * update version from 1.12.0 to 1.12.1 * update pytorch binary link with 1.12.1+cpu tag * Revert the dev config changes * Changes to buildpsec-graviton-1.10 file Co-authored-by: Mike Schneider <104035434+xncqr@users.noreply.github.com> Co-authored-by: Radhika Bhat <78102284+RadhikaB-97@users.noreply.github.com> * [Tensorflow][build] update Protobuf version to 1.15.0 (#2159) * update Protobuf version to 1.15.0 * update tensorflow-serving version * correct TF version * merge master branch in * modify buildspec * set do_build to false * set do_build to true * revert dev config * release training image (#2170) * [pytorch] | [sagemaker] SMDDP PT1.12 re-release (#2165) * SMDDP PT1.12 re-release * Revert dev config * [tensorflow][build]Re-release TF 2.8 inference DLC with cve fix (#2158) * Protobuf cve fix * Change buildspec file * Buildspec change * Set Datetimetag to false * Buildspec change * Pinning the protobuf to 3.15.0 * Revert changes to merge * [TensorFlow][Inference]Multi-gpu support for TF inference (#2149) * multi gpu support for TF inference * refactor helper function * update buildspec * SM remote test rc * debug code * revert debug code * sm remote test efa * revert dev config and buildspec * re-run efa test * re-run efa test * revert buildspec and dev config * fix typo Co-authored-by: Wei Chu Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * Update opencv to 4.6.0 for PT 1.10 Neuron sdk 1.17.0 (#2065) * update opencv to 4.6.0 for PT 1.10 Neuron sdk 1.17.1 * add dummy change to buildspec to trigger build * pin safety and unset LD_LIBRARY_PATH * Updated LD_LIBRARY_PATH to resolve relocate error * added more vulnerability ids to ignore list * moved OS version to image config section * updating default neuron buildspec with changes to OS version placement * restoring developer configuration file Co-authored-by: Kevin Yang Co-authored-by: Radhika Bhat <78102284+RadhikaB-97@users.noreply.github.com> Co-authored-by: Kenneth Ezirim <105572535+kenny-ezirim@users.noreply.github.com> Co-authored-by: Kenneth Ezirim * Update opencv to 4.6.0 for PT 1.10 Neuron sdk 1.19.0 (#2064) * update opencv to 4.6.0 for PT 1.10 Neuron sdk 1.19.0 * change buildspec instead of using toml to specify * pin safety version and unset LD_LIBRARY_PATH * attempt using UL20 instead of UL18 * revert from UL20 and try installing older package * install libattr1 * install specific libattr1 version * try to install older libattr1 * fix LD_LIBRARY_PATH * added more vulnerability ids to ignore list * moved OS version placement in buildspec * restoring developer configuration file Co-authored-by: Kevin Yang Co-authored-by: Radhika Bhat <78102284+RadhikaB-97@users.noreply.github.com> Co-authored-by: Kenneth Ezirim <105572535+kenny-ezirim@users.noreply.github.com> Co-authored-by: Kenneth Ezirim * [build][pytorch] PT 1.12 buildspec file changes (#2178) * Change buildspec 
file to add cu116 * Comment base_image_name * [build][test] Add logic to run tests for SM Lite and SM Full DLCs in PR (#2161) * [build][pytorch] PT 1.9 sagemaker-pytorch-inference version upgrade (#2171) * sagemaker-pytorch-inference version upgrade * Enable sm local tests * Un-pinning the sagemaker-pytorch-inference * Pinning sagemaker-pytorch-inference to 2.0.8 * Update Dockerfile.cpu * Update Dockerfile.gpu * Remove repo.anaconda.com * Changes to docker file for anaconda removal * Revert config file changes * Revert changes Co-authored-by: Manu Seth <22492939+mseth10@users.noreply.github.com> * Re-release TF 2.7, TF 2.8. TF 2.9 and PT 1.11 (#2183) * Re-release TF 2.7 TF 2.8 and PT 1.11 * Fix indentation * Set example to false and cuda_version to 11.3 * Typo correction * Set disable_sm_tag to false * Add cuda_version * Add Tf 2.9 * Update available images and canaries with ec2 name change (#2174) * [Triton] | [Documentation] | Update Triton DLC v22.07 (#2187) * [Pytorch][Graviton][Build][Sagemaker] PT 1.12 Graviton SageMaker Release (#2172) * Adding SM to PT112 Graviton Framework * standardizing pt graviton buildspecs * PR 1.12 is default graviton buildspec * remove SM context. not needed. * seeing if the SM config is breaking graviton build * found the issue. * added SM TS entrypoint * hoping to add missing label * setting benchmark * seeing about a label * fixing os version tag * createing a build for testing operations. * done with benchmark tests * verify green PT1.12-Graviton * revert toml. ready for review * resolve pr comments * fixed issue with conda install failing due to invalid path (#2190) * fixed issue with conda install failing due to invalid path * restoring developer config before merge * Re-release PT 1.9 (#2192) * Minor change in release file (#2193) * [Tensorflow][release] re-release tf27 (#2173) * re-release tf27 * add TF28 images * [TensorFlow][Graviton][Build][Sagemaker] TF 2.9 Graviton Release (#2169) * Added TF2.9-Graviton with SM * update toml for testing * fixed dockerfile path for tf-graviton buildspec * added sagemaker context in buildspec * conda init and activate for tensorflow test * refactor buildspec and use pip3 * TF 2.9 is default graviton buildspec * pip to pip3 * working the TF testing for inference * another pip try * doing pip fix again. * troubshooting TF testing * removing the custom install. think not needed * fixing test only * toml fix * running benchmark * little cleanup on tensorflow testing * more test tuning * attempting tests only * using tensorflow-cpu-aws for graviton test * remove test pinning * adding emulation tests for graviton SM * verify green for TF-Graviton * revert toml. 
ready for review * cleaning a comment * Resolve PR questions * added ec2 target for graviton buildsped * verify build test * fix toml typo * revert toml * update release_image.yml, remove duplicate (#2195) * deep-learning-containers: Release Neuron SDK 1.19.0 (#2194) * updating release_images.yml for Neuron release * update release_images.yml for Neuron SDK 1.19.0 release * [trcomp] [huggingface_pytorch] [build] Adding support for PyTorch 1.11 (#2032) * Add support for PT 1.11 * Update huggingface/pytorch/training/docker/1.11/py3/cu113/Dockerfile.trcomp.gpu Co-authored-by: Loki * Update huggingface/pytorch/training/docker/1.11/py3/cu113/Dockerfile.trcomp.gpu Co-authored-by: Loki * Update huggingface/pytorch/training/docker/1.11/py3/cu113/Dockerfile.trcomp.gpu Co-authored-by: Loki * Base Image Change * Skip frameworks * Skip frameworks fix * set trcomp mode * empty commit * Update with updated wheel * upgrade nccl to 2.12.10 and aws-ofi-nccl to 1.4.0 * Moving tranformers version to 4.21.1 * Retry PR tests * Reverting changes to Base PT container * add SNMG,MNMG to TRCOMP integration test * fix integration test issues and typos * add hyper-parameters for distributed training * add fixture for instance_count and instance_type * fix: updating PT binaries built with NCCL=2.10.3 * fix: updating transformers binary path for trcomp-hf-pt-1.11 * fix: updating transformers binary path for trcomp-hf-pt-1.11 * Fixing integration tests * fix: updating integration tests to not set stale environment variables from previous versions * fix: broken fixtures in integration tests * fix: broken fixtures in integration tests * fix: broken fixtures in integration tests * fix: Downgrade numpy version for numba * fix: updating training toolkit * fix: ignoring CVE in rfdlib * fix: ignoring CVE in rfdlib * fix: numpy version dependency issue * documentation: explaning pytorch_xla DT workaround * fix: json formatting error * fix: more assertions for multi node integration tests to check for cross node communication * fix: broken numba installation * Rebuilding with new PT binaries built with numpy==1.22.2 * fix: skipping MPI tests for this DLC * Build image without datetime tag * Running RC tests * Running EFA tests * upgrade NCCL to 2.12.12 * rename build spec file * revert dlc developer config Co-authored-by: dasritwi Co-authored-by: Loki Co-authored-by: Dingheng (Bruce) Zhang Co-authored-by: Harish Tummalacherla Co-authored-by: Loki Co-authored-by: Bruce Zhang * [tensorflow][pytorch][graviton][test] Sanity test fix (#2196) * revert test-buidspec for arch. * set toml to verify tests * typo.. * TF Graviton is good rerunning PT * Cleaning PT112 cache for security check * tests complete. revert toml * [pytorch][tensorflow][graviton][build] temp disable sm-graviton (#2198) * temp disable sm-graviton * tests passed. reverting toml * [Tensorflow][Pytorch][Graviton][release] PT1.12 and TF2.9 graviton images release (#2199) * Release PT1.12 and TF2.9 Graviton images * newline on file * set sm_tag to false * fixing type and version in release_images (#2200) * fixing type and version in release_images * version is correct. * [pytorch][tensorflow][graviton][release] graviton release fix (#2203) * fixing type and version in release_images * version is correct. * release fix for graviton images * [pytorch][tensorflow][graviton][cleanup] remove release_image entries for graviton (#2204) * fixing type and version in release_images * version is correct. 
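One detail from the trcomp PT 1.11 work above, the "add fixture for instance_count and instance_type" commit, reflects a common way to reuse SageMaker integration tests across single-node and multi-node (SNMG/MNMG) runs. The fixtures below are hypothetical stand-ins for illustration, not the repository's conftest code:

    import pytest

    @pytest.fixture(params=["ml.p3.16xlarge"])
    def instance_type(request):
        return request.param

    @pytest.fixture(params=[1, 2], ids=["single-node", "multi-node"])
    def instance_count(request):
        return request.param

    def test_distributed_training_smoke(instance_type, instance_count):
        # A real test would launch a SageMaker training job with these values;
        # this only demonstrates how the fixtures combine.
        assert instance_count >= 1 and instance_type.startswith("ml.")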
* release fix for graviton images * removing Graviton entries from release_images * Check Sanity Tests for TrComp Images (#2202) * Check Sanity Tests for TrComp Images * Random change in buildpsec * Add HF-pt-trcomp for PT1.11 to the allowlist * Remove -q from the docker pull command * Skip ec2 nccl test * Revert temp changes * Run Benchmark tests and other tests for fixing trcomp mainline failures (#2205) * Run Performance tests and other tests for fixing trcomp mainline failures * Add to allowlist in dependency check * Run SM local and remote tests * Turn benchmark to false * Remove -q from the image pulls * Revert temp changes * Add new graviton images to available_images (#2206) * [Pytorch] | [build] | [test] | re-release PT 1.12 SM DLCs (#2201) * re-release PT 1.12 SM DLCs * fix buildspec * run efa * Revert toml file to original state * Release PT-Trcomp images (#2207) * [Tensorflow][build][graviton] update protobuf to 3.15.0 (#2160) * update protobuf to 1.15.0 * update test * update graviton host setup * using pip3 in graviton images * use base conda env * conda init bash first * test using pip3 * install pip in OS level * do apt-get update * do build again * revert dev config * update build spec * rename build spec file * set dev config * revert dev config * Release file update for TF 2.8 (#2209) * Python version update for TF 2.8 * Clean up * re-release PT1.12 sm training images (#2210) * re-release TF2.7 graviton cpu inference (#2214) * Add trcomp images to available_images.yml (#2211) * [pytorch][eia][build]Handling Pytorch 1.5.1 eia vulnerabilities (#2179) * Handling Pytorch 1.5.1 eia vulnerabilities * Setting parameters for test * Modify wheel version * Disable tests * Enable local and remote tests * Set dlc-pr-pytorch-eia * Specify python version * Move dockerfile to py38 folder * Move to Mambaforge * Upgrade ubuntu base and MMS * Prevent stopping by user interaction * Correct ubuntu tag to 20.04 * Modify buildspec-eia.yml instead * Set the right buildspec path * Fix the format of sagemaker eia test * Temporarily skip major version check * Update dlc_developer_config.toml * Update dlc_developer_config.toml Co-authored-by: Radhika Bhat <78102284+RadhikaB-97@users.noreply.github.com> Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * EIA pt-1.5.1 (#2218) * [pytorch][build][ec2] PyTorch 1.12.1 EC2 release (#2189) * feat: create buildspec for 1.12.1 * feat: create initial buildspec for 1.12.1 e3 * feat: renew the whl urls for 1.12.1 release * refactor: remove the 1-12-1 legacy buildspec * revert useless cu113 for e3 * Revert "revert useless cu113 for e3" This reverts commit 946e08719e11d991f7589a10642a0d747fbdeb1e. * feat: add cuda 11.3 image builds for E3 * fix: replace the RC urls with the release ones * fix: replace the RC urls with the release ones * refactor: cancel cu113 build for e3 * feat: replace URLs with built binaries * fix: use RC url for torchdata, official one is invalid * feat: fill in the built whl urls (with normal format) * revert changes in cu113 Dockerfile for e3 PR * style: remove spaces * feat: fill in new ec2 whls * fix: replace e3 with ec2 in the buildspec * test: update the dgl branch to fix the data.graph deprecation issue * test: use the system dgl version as branch info for example test * Revert "test: use the system dgl version as branch info for example test" This reverts commit 63017e45234323105e8555b30fa30b5dd57b9cb6. 
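The dgl bullets just above (using the system dgl version as branch info for the example test) amount to detecting the installed dgl release series and picking the matching examples branch, e.g. 0.8.x versus 0.9.x. The repository does this in a bash test script; the Python sketch below is only an illustration of the same idea.

    from importlib.metadata import version  # Python 3.8+

    def dgl_example_branch() -> str:
        """Derive a branch name like "0.9.x" from the installed dgl package version."""
        major, minor, *_ = version("dgl").split(".")
        return f"{major}.{minor}.x"

    if __name__ == "__main__":
        # With dgl 0.9.1 installed this prints "0.9.x".
        print(dgl_example_branch())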
* test: reset it to 0.8.x since 0.9.x dgl's train.py arguments changed * refactor: change sm training whls to aws * test: auto-adapt the dgl version through bash * test: remove the gpu argument for the new dgl * refactor: revert training sm url changes * refactor: remove the dgl submodule Co-authored-by: Shibo Xing * add eia container to avaiable_image list (#2222) Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> * Add ec2 dockerfiles for Tensorflow 2.10 (#2163) * Copying tensorflow training 2.9 to 2.10 directory * Update tf210 e3 cpu Dockerfile * Update tf210 e3 gpu Dockerfile * Update tf210 buildspec for ec2 training * Skipping non-tensorflow framework testing * Updating TF 2.10 Dockerfile e3 naming to ec2 * Force buildspec short version 2.10 to be a string * Comment out sagemaker tensorflow buildspec blocks * Improving version check for TF2 standalone test * Update image recipe ascii art from e3 to ec2 * Updating OFI version to 1.4.0-aws * Updating EFA version to 1.17.2 * Run apt-get update for EFA install * Updating EC2 wheels for TF 2.10 RC1 * Updating TF 2.10 EC2 wheels for RC2 * Updating TF 2.10 EC2 wheels for RC3 * Updating TF 2.10 EC2 wheels for smdebug/smprofile * Updating TF 2.10 EC2 wheels for Hopper * Updating TF 2.10 EC2 wheels for GA release * Added OpenSSL exceptions for TF 2.10 * Revert "Skipping non-tensorflow framework testing" This reverts commit 29cefbb45d6b7bc9160c437e95601988e6d08b14. Co-authored-by: kevinyang8 <40340762+kevinyang8@users.noreply.github.com> * [pytorch][build][sagemaker] PyTorch 1.12.1 SageMaker release (#2188) * feat: create buildspec for 1.12.1 * feat: renew the whl urls for 1.12.1 release * fix: replace the test torch whls with normal whls * refactor: remove the 1.12.1 legacy buildspec * revert the useless cu116 changes for sm * feat: fill in the built binary urls * feat: fill in the built binary urls (with normal format) * feat: add in the new herring whl * feat: fill in the new ec2 whls * Update dlc_developer_config.toml * feat: fill Rubik re-compiled whl * feat: run security tests * test: turned off remote sm tests * Revert "test: turned off remote sm tests" This reverts commit ee9a91f4bbf872ab72206fb89bf16e764f215ffa. * Revert "feat: run security tests" This reverts commit 1e38d3a31c9f1c18a839b23a4f2ec06a514eea31. * Revert "Update dlc_developer_config.toml" This reverts commit 4fd59cff241ddd4ddaebc91e04b6281b2f51857e. 
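One TF 2.10 detail above is worth spelling out: "Force buildspec short version 2.10 to be a string" exists because YAML parses an unquoted 2.10 as the float 2.1, silently dropping the trailing zero, which is why the buildspecs quote values such as short_version: "2.10". A quick illustration with PyYAML:

    import yaml

    # Unquoted, YAML reads 2.10 as a number and it becomes 2.1, which is not what a
    # version field is meant to carry. Quoting preserves the literal "2.10".
    unquoted = yaml.safe_load("short_version: 2.10")
    quoted = yaml.safe_load('short_version: "2.10"')

    print(unquoted["short_version"])  # 2.1 (float)
    print(quoted["short_version"])    # 2.10 (str)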
* refactor: use aws url for ec2 training urls * test: revert the ec2 whl url changes * refactor: remove the dgl submodule * feat: update ec2 train whl, smdebug versions * test: turn on all tests * perf: add frameworks to be skipped * test: remove skip_frameworks, for profiler tests need tensorflow * revert changes to buildspec and dev configs * revert changes in dev configs * uncomment sm configs * refactor: uncomment pytorch examples gpu Co-authored-by: Shibo Xing Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> * Add Tensorflow Serving dockerfiles for Tensorflow 2.10 (#2182) * Skipping non-tensorflow framework testing * Copying 2.9 folder to 2.10 for TF 2.10 release * Tensorflow Serving update for TF 2.10 release * Fixing tensorflow-serving-api version number * Increase image baseline size for EC2 GPU Inference * Updating TFS 2.10 wheels for RC2 * Updating TFS 2.10 wheels for RC3 * Adding buildspec file for TF 2.9 * Updating TFS 2.10 wheels for GA * Wait for apt-get to finish before installing pip * Revert "Skipping non-tensorflow framework testing" This reverts commit 2b2503e5f1f48fa9e73dd48086bcf22b902fb7a5. * Add TF 2.10 EC2 to release images (#2228) * add TF 2.10 EC2 to release images * add example image * disable_sm_tag set to false Co-authored-by: Kevin Yang * fix variable name (#2230) Co-authored-by: Kevin Yang * habana dlc 1.6.0 release (#2175) * first build success * disable datatime_tag and enable habana mode * enable benchmark mode * revert temp changes Co-authored-by: omrialmog Co-authored-by: hballuru <113142824+hballuru@users.noreply.github.com> Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> * Bump tensorflow from 2.5.3 to 2.7.2 in /test/sagemaker_tests/huggingface_tensorflow/training (#1938) * Bump tensorflow in /test/sagemaker_tests/huggingface_tensorflow/training Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.5.3 to 2.7.2. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.5.3...v2.7.2) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * Allow SM integ tests to run * Revert temporary changes Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: Tejas Chumbalkar <34728580+tejaschumbalkar@users.noreply.github.com> Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Co-authored-by: Sai Parthasarathy Miduthuri * remove example cpu image (#2231) Co-authored-by: Kevin Yang * Create separate inference buildspec for PT 1.12 (#2232) * Create separate inference buildspec for PT 1.12 * Update buildspec-1-12-inference.yml * update available_images with TF 2.10 EC2 release (#2233) Co-authored-by: Kevin Yang * Add nightly release docker files (#2166) * [Pytorch] Release for PT1.12.1 (#2226) * [pytorch][build][sagemaker] PyTorch 1.11.0 SM training re-release, fixing perf degradation (#2220) * feat: renew the sm training whls (with perf issue fix) * feat: adjust the buildspec * fix: uncomment the ec2 for base image * test: comment out ec2 section in buildspec * feat: adjust smdebug to 1.0.22 * test: disable profiler test for >1.10 * rever the buildspec changes * feat: adjust buildspec, smdebug for 1.11 * feat: turn on the sm tests, over buildspec with 1.11 * Revert "feat: adjust smdebug to 1.0.22" This reverts commit b4e13a0f2b024060bba8126cc043d7e0f438ec3d. * test: disable irrelevant builds * feat: renew the herring whl * feat: turn on the security checks * feat: renew Rubik's whl * fix: add missing raise for wrong local path * fix: profiler skipping message * test: turn on SM rc tests * test: run standard SM test * revert the developer config changes * revert 1.11 buildspec changes Co-authored-by: Shibo Xing * Update the available_image.md for 1.12.1 release (#2237) * [Pytorch] Re-Release Images Update for PT1.11 Re-Release (#2238) * [pytorch][build][sagemaker] PyTorch 1.10.2 SM training re-release, fixing smdebug perf degradation (#2219) * Update release_images for 1.10.2 re-release (#2241) * Updated PT1.10 to Build SM Inference (#2245) * Updated build for inference * Updated Conda forge * Additional fixes for inference dockerfile * Additional updates to inference image * Updated GPU conda updates * Update Dockerfile.ec2.cpu * Reverted Config * Reverted Buildspec Co-authored-by: Ohad Katz * Added update to pip list that fails due to pip versioning outputting to stdout (#2256) Co-authored-by: Ohad Katz * [test] Add ECR Enhanced Scanning Tests (#2130) * Add ECR Enhanced Scanning Tests * do_build false * Rebuild images * Add test_ecr_enhanced_scan * Try rebuilding images * Temp change to build images * Upload image * Add functions to read ecr enhanced data * Fix builds * Add pagination based calls * Add the wait function for scans to complete * Minor fix * Created ECRBasicScanVulnerabilityList class * Make minor fix to basic scan and add enhanced scan * Added ecr scan format processor to conver ecr scan results to allowlist format * Add tests for checking the functionality of ScanVulnerabilityList * Add subtraction tests for the ScanVulnerabilityList * Add tests to confirm that the allowlist is being read properly * Added tests for add and __cmp__ operator check * Resolved Undefined Score issues * Adding allowlist feature and fixing sorting issues * Add try except in test_ecr_scan and a print in test_dlc_labels * Run formatter tool on test_ecr_scan and security * Rebuild images * Added error handling for ScanNotFound Exception * 
Format and add comments to __init__.py * Formatted ecr.py and added comments * Added minor comment on ecr.py * Fix key issue in case there are no vulnerabilities for an image * Rerun to see if the tests fail this time * Add buffer after wait for enhanced scans * Do not build images * Print the dict in string format * Rebuild images * Print remaining_vulnerabilities in JSON format * Do not build images * Print the vulnerability list as str * Rebuild images and show JSON * Take test_security out of test_ecr_scan and increase the enhanced scanning timeout * Added object based usage of AllowListFormatVulnerabilityForEnhancedScan (WIP) * Rebuild image and deserialize to json object in assert statement * Make changes in one commit * Use EnhancedJSONEncoder class to handle datetime objects as well * Respond to review comments - 1 * Rebuild images with ec2 tags * Responding to review comments - 2 * Added __eq__ and __ne__ operators * Log… * [NEURON][PT][SM][TEST] - Add simple single node neuron test (#489) * Update release_images.yml For hf neuron for the time being have disable_sm_tag to True * Update release_images.yml * [NEURON][PT][SM][TEST]: Add single node SM test Signed-off-by: Venky Natham * fix ecr scan for neuron/eia * revert temp config Signed-off-by: Venky Natham Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: tejaschumbalkar * [NEURON][PT][TRAIN] - Release the first version of neuron sdk2.3.0 (#493) * Update release_images.yml For hf neuron for the time being have disable_sm_tag to True * Update release_images.yml * Use public repo Signed-off-by: Venky Natham * Enable neuron test Signed-off-by: Venky Natham * Fix the buildspec to use the neuron sdk tag Signed-off-by: Venky Natham * Fix the torch-neuron version. Only torchx-neuron has the version 1.11.0.1.1.1..
Sanity test is checking for torch-neuron Signed-off-by: Venky Natham * Ignore failure due to safety package Signed-off-by: Venky Natham * revert temp config Signed-off-by: Venky Natham Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: tejaschumbalkar * release PT1.11 neuron training DLC (#494) * release PT1.11 neuron training DLC * set disable_sm_tag to True * set the tag in correct item * nit change * run neuron * Ignore 51457 vulnerability Signed-off-by: Venky Natham * nit change * remove temp condition * revert temp config Signed-off-by: Venky Natham Signed-off-by: dependabot[bot] Co-authored-by: Jeetendra Patil Co-authored-by: Karan Jariwala Co-authored-by: Lai Wei Co-authored-by: Akhil Mehra Co-authored-by: yselivonchyk Co-authored-by: akhilmehra Co-authored-by: Rajan Singh Co-authored-by: omrialmog Co-authored-by: omrialmog Co-authored-by: Alexander Shirkov <10080307+gradientsky@users.noreply.github.com> Co-authored-by: Sergey Togulev <34056697+SergTogul@users.noreply.github.com> Co-authored-by: Sergey Togulev Co-authored-by: Alexander Shirkov Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> Co-authored-by: Sai Parthasarathy Miduthuri <54188298+saimidu@users.noreply.github.com> Co-authored-by: pinaraws <47152339+pinaraws@users.noreply.github.com> Co-authored-by: Sai Parthasarathy Miduthuri Co-authored-by: David Huang Co-authored-by: Shantanu Tripathi Co-authored-by: David Huang Co-authored-by: Loki Co-authored-by: aws-vrnatham <73142315+aws-vrnatham@users.noreply.github.com> Co-authored-by: Qingzi-Lan <83724147+Qingzi-Lan@users.noreply.github.com> Co-authored-by: Qingzi-Lan Co-authored-by: Harish Tummalacherla Co-authored-by: Ubuntu Co-authored-by: Junpu Fan Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Co-authored-by: Venky Natham Co-authored-by: Manu Seth <22492939+mseth10@users.noreply.github.com> Co-authored-by: Wei Chu Co-authored-by: Judy Heflin Co-authored-by: Hongshan Li Co-authored-by: Shantanu Tripathi Co-authored-by: Buke Ao Co-authored-by: Anny Chung Co-authored-by: waytrue17 <52505574+waytrue17@users.noreply.github.com> Co-authored-by: Rahul Huilgol Co-authored-by: Daiming Yang <66369380+ydaiming@users.noreply.github.com> Co-authored-by: lxning <23464292+lxning@users.noreply.github.com> Co-authored-by: kevinyang8 <40340762+kevinyang8@users.noreply.github.com> Co-authored-by: Kevin Yang Co-authored-by: Zhongkai Zhang <74077786+zzhang37@users.noreply.github.com> Co-authored-by: Fei <33940270+YangFei1990@users.noreply.github.com> Co-authored-by: haohanchen-yagao Co-authored-by: Judy Heflin Co-authored-by: Zeeshan Ashraf Co-authored-by: Nishanth Hegde Co-authored-by: RadhikaB-97 Co-authored-by: Radhika Bhat <78102284+RadhikaB-97@users.noreply.github.com> Co-authored-by: matherit <97054789+matherit@users.noreply.github.com> Co-authored-by: Xin Yang <105740670+xyang16@users.noreply.github.com> Co-authored-by: Mike Schneider <104035434+xncqr@users.noreply.github.com> Co-authored-by: Nishanth Hegde Co-authored-by: Daiming Yang Co-authored-by: Satish Pasumarthi <35979860+satishpasumarthi@users.noreply.github.com> Co-authored-by: Satish Pasumarthi Co-authored-by: Kenneth Ezirim Co-authored-by: Buke Ao Co-authored-by: Ritwik Das Co-authored-by: dasritwi Co-authored-by: Loki Co-authored-by: rohithkrn Co-authored-by: Rohith Nallamaddi Co-authored-by: Kenneth Ezirim <105572535+kenny-ezirim@users.noreply.github.com> 
Co-authored-by: haohanchen-yagao <54413235+haohanchen-yagao@users.noreply.github.com> Co-authored-by: Vishwa Karia <45134824+vishwakaria@users.noreply.github.com> Co-authored-by: Nikhil Kulkarni Co-authored-by: Dingheng (Bruce) Zhang Co-authored-by: Harish Tummalacherla Co-authored-by: Bruce Zhang Co-authored-by: jinpengqi <67934931+jinpengqi@users.noreply.github.com> Co-authored-by: ShiboXing Co-authored-by: Shibo Xing Co-authored-by: Danny Key <108494184+dkey-amazon@users.noreply.github.com> Co-authored-by: hballuru <113142824+hballuru@users.noreply.github.com> Co-authored-by: Ohad Katz Co-authored-by: Ohad Katz Co-authored-by: Kristopher Siman Co-authored-by: Kristopher Siman Co-authored-by: Ohad Katz <20158647+ohadkatz@users.noreply.github.com> --- .release_images_template.yml | 2 +- data/ignore_ids_safety_scan.json | 14 + pytorch/buildspec-1-10-neuron.yml | 29 + pytorch/buildspec-1-11-neuron.yml | 59 ++ pytorch/buildspec-neuron.yml | 49 +- .../1.10/py3/sdk2.1.1/Dockerfile.neuron | 200 +++++ .../1.11/py3/sdk2.3.0/Dockerfile.neuron | 197 +++++ .../Dockerfile.neuron.os_scan_allowlist.json | 708 ++++++++++++++++++ release_images.yml | 2 +- src/utils.py | 5 + .../training/test_trcomp_performance.py | 2 +- test/dlc_tests/conftest.py | 26 + .../pytorch_tests/testNeuronSingleAllReduce | 29 + .../pytorch/training/test_pytorch_training.py | 13 +- test/dlc_tests/sanity/test_pre_release.py | 12 +- test/dlc_tests/sanity/test_safety_check.py | 22 +- .../pytorch/training/conftest.py | 25 +- .../pytorch/training/integration/__init__.py | 1 + .../integration/sagemaker/test_neuron.py | 61 ++ .../training/resources/neuron/all_reduce.py | 44 ++ .../training/resources/neuron/entrypoint.py | 54 ++ test/test_utils/__init__.py | 11 + test/test_utils/ec2.py | 16 +- test/test_utils/sagemaker.py | 13 +- test/test_utils/security.py | 2 +- 25 files changed, 1564 insertions(+), 32 deletions(-) create mode 100644 pytorch/buildspec-1-11-neuron.yml create mode 100644 pytorch/training/docker/1.10/py3/sdk2.1.1/Dockerfile.neuron create mode 100644 pytorch/training/docker/1.11/py3/sdk2.3.0/Dockerfile.neuron create mode 100644 pytorch/training/docker/1.11/py3/sdk2.3.0/Dockerfile.neuron.os_scan_allowlist.json create mode 100644 test/dlc_tests/container_tests/bin/pytorch_tests/testNeuronSingleAllReduce create mode 100644 test/sagemaker_tests/pytorch/training/integration/sagemaker/test_neuron.py create mode 100644 test/sagemaker_tests/pytorch/training/resources/neuron/all_reduce.py create mode 100644 test/sagemaker_tests/pytorch/training/resources/neuron/entrypoint.py diff --git a/.release_images_template.yml b/.release_images_template.yml index c08cebc3e927..dacd77db2607 100644 --- a/.release_images_template.yml +++ b/.release_images_template.yml @@ -1058,4 +1058,4 @@ release_images: cuda_version: "cu112" example: False disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False \ No newline at end of file + force_release: False diff --git a/data/ignore_ids_safety_scan.json b/data/ignore_ids_safety_scan.json index f7c75ec196ee..f1e387bcd92f 100644 --- a/data/ignore_ids_safety_scan.json +++ b/data/ignore_ids_safety_scan.json @@ -410,6 +410,20 @@ "51159":"cryptography>38.0.1 does not exist yet" } }, + "training-neuron":{ + "_comment":"py2 is deprecated", + "py2": { + }, + "py3": { + "43453":"numpy > 1.22.0 is not available for py37", + "44715":"numpy > 1.22.0 is not available for py37", + "44717":"numpy > 1.22.0 is not available for py37", + "44716":"numpy > 1.22.0 is not 
available for py37", + "51159":"cryptography>38.0.1 does not exist yet", + "51358":"Safety is test pkg and not part of image", + "51457":"Ignored- please check https://github.com/pytest-dev/py/issues/287" + } + }, "inference": { "_comment":"py2 is deprecated", "py2": { diff --git a/pytorch/buildspec-1-10-neuron.yml b/pytorch/buildspec-1-10-neuron.yml index 1f29b8e2c55c..f2b3de302893 100644 --- a/pytorch/buildspec-1-10-neuron.yml +++ b/pytorch/buildspec-1-10-neuron.yml @@ -2,10 +2,16 @@ account_id: &ACCOUNT_ID region: ®ION framework: &FRAMEWORK pytorch version: &VERSION 1.10.2 +os_version: &OS_VERSION ubuntu18.04 short_version: &SHORT_VERSION "1.10" arch_type: x86 repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", neuron] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] inference_repository: &INFERENCE_REPOSITORY image_type: &INFERENCE_IMAGE_TYPE inference root: !join [ *FRAMEWORK, "/", *INFERENCE_IMAGE_TYPE ] @@ -13,6 +19,16 @@ repository_info: repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] context: + training_context: &TRAINING_CONTEXT + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py inference_context: &INFERENCE_CONTEXT neuron-monitor: source: docker/build_artifacts/neuron-monitor.sh @@ -28,6 +44,19 @@ context: target: config.properties images: + BuildNeuronPTTrainingPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_INF_TRAINING_PY3 false + image_size_baseline: 5000 + device_type: &DEVICE_TYPE neuron + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py36 + neuron_sdk_version: &NEURON_SDK_VERSION sdk2.1.1 + os_version: &OS_VERSION ubuntu18.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *NEURON_SDK_VERSION, "-", *OS_VERSION ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *NEURON_SDK_VERSION, /Dockerfile., neuron ] + context: + <<: *TRAINING_CONTEXT BuildNeuronPTInferencePy3DockerImage: <<: *INFERENCE_REPOSITORY build: &PYTORCH_INF_INFERENCE_PY3 false diff --git a/pytorch/buildspec-1-11-neuron.yml b/pytorch/buildspec-1-11-neuron.yml new file mode 100644 index 000000000000..41367536ee9f --- /dev/null +++ b/pytorch/buildspec-1-11-neuron.yml @@ -0,0 +1,59 @@ +account_id: &ACCOUNT_ID +region: ®ION +framework: &FRAMEWORK pytorch +version: &VERSION 1.11.0 +os_version: &OS_VERSION ubuntu20.04 +short_version: &SHORT_VERSION "1.11" +arch_type: x86 + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", neuron] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + inference_repository: &INFERENCE_REPOSITORY + image_type: &INFERENCE_IMAGE_TYPE inference + root: !join [ *FRAMEWORK, "/", *INFERENCE_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join 
[pr, "-", *FRAMEWORK, "-", *INFERENCE_IMAGE_TYPE, "-", neuron] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + inference_context: &INFERENCE_CONTEXT + neuron-monitor: + source: docker/build_artifacts/neuron-monitor.sh + target: neuron-monitor.sh + neuron-entrypoint: + source: docker/build_artifacts/neuron-entrypoint.py + target: neuron-entrypoint.py + torchserve-neuron: + source: docker/build_artifacts/torchserve-neuron.sh + target: torchserve-neuron.sh + config: + source: docker/build_artifacts/config.properties + target: config.properties + +images: + BuildNeuronPTTrainingPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_INF_TRAINING_PY3 false + image_size_baseline: 10000 + device_type: &DEVICE_TYPE neuron + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py38 + neuron_sdk_version: &NEURON_SDK_VERSION sdk2.3.0 + os_version: &OS_VERSION ubuntu20.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *NEURON_SDK_VERSION, "-", *OS_VERSION ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *NEURON_SDK_VERSION, /Dockerfile., neuron ] + context: + <<: *TRAINING_CONTEXT \ No newline at end of file diff --git a/pytorch/buildspec-neuron.yml b/pytorch/buildspec-neuron.yml index a2803c2d1752..fcb8ba98bfec 100644 --- a/pytorch/buildspec-neuron.yml +++ b/pytorch/buildspec-neuron.yml @@ -1,11 +1,17 @@ account_id: &ACCOUNT_ID region: ®ION framework: &FRAMEWORK pytorch -version: &VERSION 1.10.2 -short_version: &SHORT_VERSION "1.10" +version: &VERSION 1.11.0 +os_version: &OS_VERSION ubuntu20.04 +short_version: &SHORT_VERSION "1.11" arch_type: x86 repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", neuron] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] inference_repository: &INFERENCE_REPOSITORY image_type: &INFERENCE_IMAGE_TYPE inference root: !join [ *FRAMEWORK, "/", *INFERENCE_IMAGE_TYPE ] @@ -13,6 +19,16 @@ repository_info: repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] context: + training_context: &TRAINING_CONTEXT + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py inference_context: &INFERENCE_CONTEXT neuron-monitor: source: docker/build_artifacts/neuron-monitor.sh @@ -28,18 +44,29 @@ context: target: config.properties images: - BuildNeuronPTInferencePy3DockerImage: - <<: *INFERENCE_REPOSITORY - build: &PYTORCH_INF_INFERENCE_PY3 false + # BuildNeuronPTInferencePy3DockerImage: + # <<: *INFERENCE_REPOSITORY + # build: &PYTORCH_INF_INFERENCE_PY3 false + # image_size_baseline: 10000 
+ # device_type: &DEVICE_TYPE neuron + # python_version: &DOCKER_PYTHON_VERSION py3 + # tag_python_version: &TAG_PYTHON_VERSION py37 + # os_version: &OS_VERSION ubuntu18.04 + # neuron_sdk_version: &NEURON_SDK_VERSION sdk1.19.0 + # tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *NEURON_SDK_VERSION, "-", *OS_VERSION ] + # docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *NEURON_SDK_VERSION, /Dockerfile., neuron ] + # context: + # <<: *INFERENCE_CONTEXT + BuildNeuronPTTrainingPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_INF_TRAINING_PY3 false image_size_baseline: 10000 device_type: &DEVICE_TYPE neuron python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py37 - os_version: &OS_VERSION ubuntu18.04 - neuron_sdk_version: &NEURON_SDK_VERSION sdk1.19.0 + tag_python_version: &TAG_PYTHON_VERSION py38 + neuron_sdk_version: &NEURON_SDK_VERSION sdk2.3.0 + os_version: &OS_VERSION ubuntu20.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *NEURON_SDK_VERSION, "-", *OS_VERSION ] docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *NEURON_SDK_VERSION, /Dockerfile., neuron ] context: - <<: *INFERENCE_CONTEXT - - + <<: *TRAINING_CONTEXT diff --git a/pytorch/training/docker/1.10/py3/sdk2.1.1/Dockerfile.neuron b/pytorch/training/docker/1.10/py3/sdk2.1.1/Dockerfile.neuron new file mode 100644 index 000000000000..5e9aae0a2096 --- /dev/null +++ b/pytorch/training/docker/1.10/py3/sdk2.1.1/Dockerfile.neuron @@ -0,0 +1,200 @@ +FROM ubuntu:18.04 + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON=python3.7 +# Add arguments to achieve the version, python and url +ARG PYTHON_VERSION=3.7.10 +ARG OPEN_MPI_VERSION=4.0.1 +ARG PIP=pip3 + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ARG DEBIAN_FRONTEND=noninteractive + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. 
Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main +ENV DGLBACKEND=pytorch + +RUN apt-get update \ + # TODO: Remove systemd upgrade once it is updated in base image + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libopencv-dev \ + openjdk-8-jdk-headless \ + openjdk-8-jdk \ + openjdk-8-jre \ + libglib2.0-0 \ + libgl1-mesa-glx \ + libsm6 \ + libxext6 \ + libxrender-dev \ + openjdk-11-jdk \ + software-properties-common \ + wget \ + unzip \ + vim \ + zlib1g-dev \ + openssl \ + libssl-dev \ + libsqlite3-dev \ + libgdbm-dev \ + libc6-dev \ + libbz2-dev \ + tk-dev \ + libffi-dev \ + libcap-dev \ + gnupg2 \ + gpg-agent \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +RUN echo "deb https://NeuronUser:ZOgdZY3PLBTOwMX1@apt.repos.beta.sunda.neuron.annapurna.aws.a2z.com bionic main" > /etc/apt/sources.list.d/neuron.list +RUN wget -qO - https://NeuronUser:ZOgdZY3PLBTOwMX1@apt.repos.beta.sunda.neuron.annapurna.aws.a2z.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - + +RUN apt-get update \ + && apt-get install -y \ + aws-neuronx-tools \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +RUN wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-$OPEN_MPI_VERSION.tar.gz \ + && gunzip -c openmpi-$OPEN_MPI_VERSION.tar.gz | tar xf - \ + && cd openmpi-$OPEN_MPI_VERSION \ + && ./configure --prefix=/home/.openmpi \ + && make all install \ + && cd .. \ + && rm openmpi-$OPEN_MPI_VERSION.tar.gz \ + && rm -rf openmpi-$OPEN_MPI_VERSION + +# install Python +RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \ + && tar -xzf Python-$PYTHON_VERSION.tgz \ + && cd Python-$PYTHON_VERSION \ + && ./configure --enable-shared --prefix=/usr/local \ + && make -j $(nproc) && make install \ + && cd .. && rm -rf ../Python-$PYTHON_VERSION* \ + && ln -s /usr/local/bin/pip3 /usr/bin/pip \ + && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \ + && ${PIP} --no-cache-dir install --upgrade \ + pip \ + setuptools + +WORKDIR / + +# The ENV variables declared below are changed in the previous section +# Grouping these ENV variables in the first section causes +# ompi_info to fail. 
This is only observed in CPU containers +ENV PATH="$PATH:/home/.openmpi/bin" +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/" +RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value + +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh + + +RUN ${PIP} install --no-cache-dir -U \ + "pyyaml>=5.4,<5.5" \ + "bokeh>=2.3,<3" \ + "opencv-python>=4.6,<5" \ + "awscli<2" \ + scipy \ + click \ + "cryptography>3.2" \ + "sagemaker>=2,<3" \ + "sagemaker-pytorch-training<3" \ + psutil==5.6.7 \ + dataset \ + transformers \ + 'Pillow>=9.0.1,<9.1.0' + +RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt +RUN ${PIP} config set global.extra-index-url "https://NeuronUser:ZOgdZY3PLBTOwMX1@pip.repos.beta.sunda.neuron.annapurna.aws.a2z.com" \ + && ${PIP} install --force-reinstall torch-neuronx==1.10.2.* --extra-index-url https://NeuronUser:ZOgdZY3PLBTOwMX1@pip.repos.beta.sunda.neuron.annapurna.aws.a2z.com \ + && ${PIP} install --force-reinstall neuronx-cc==2.* --extra-index-url https://NeuronUser:ZOgdZY3PLBTOwMX1@pip.repos.beta.sunda.neuron.annapurna.aws.a2z.com + +# attrs, neurox-cc required: >=19.2.0, sagemaker 2.103.0 <22,>=20.3.0 +# protobuf neurox-cc<4 , sagemaker training <3.20,>=3.9.2 +# awscli 1.25.47 has requirement docutils<0.17,>=0.10 +# etcd for kubernetes installation +RUN ${PIP} install --no-cache-dir -U \ + 'attrs>=20.3.0,<22.0.0' \ + 'protobuf>=2.9.2,<3.20' \ + 'docutils>=0.10,<0.17' \ + "python-etcd" + +# Install extra packages needed by sagemaker (for passing test_utility_packages_using_import) +RUN pip install --no-cache-dir -U \ + "bokeh>=2.3,<3" \ + "imageio>=2.9,<3" \ + "opencv-python>=4.3,<5" \ + "plotly>=5.1,<6" \ + "seaborn>=0.11,<1" \ + "numba<0.54" \ + "shap>=0.39,<1" + +# EFA Installer does apt get. 
Make sure to run apt update before that +RUN apt-get update +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \ + && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \ + && cat aws-efa-installer.key | gpg --fingerprint \ + && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \ + && tar -xf aws-efa-installer-latest.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && cd $HOME + + +## Install neuron collectives library +RUN apt-get install -f aws-neuronx-collectives + +## Install neurun runtime-lib +RUN apt-get install -f aws-neuronx-runtime-lib + + +# Clean up after apt update +RUN rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \ + && chmod +x /usr/local/bin/deep_learning_container.py + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.9/license.txt + +# Starts framework +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] +CMD ["/bin/bash"] diff --git a/pytorch/training/docker/1.11/py3/sdk2.3.0/Dockerfile.neuron b/pytorch/training/docker/1.11/py3/sdk2.3.0/Dockerfile.neuron new file mode 100644 index 000000000000..a5ff1b51c13a --- /dev/null +++ b/pytorch/training/docker/1.11/py3/sdk2.3.0/Dockerfile.neuron @@ -0,0 +1,197 @@ +FROM ubuntu:20.04 + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON=python3.8 +# Add arguments to achieve the version, python and url +ARG PYTHON_VERSION=3.8.10 +# Add arguments to achieve the version, python and url +ARG OPEN_MPI_VERSION=4.0.1 +ARG PIP=pip3 + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ARG DEBIAN_FRONTEND=noninteractive + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. 
Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" +ENV PATH /opt/aws/neuron/bin/:$PATH +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main +ENV DGLBACKEND=pytorch + +RUN apt-get update \ + # TODO: Remove systemd upgrade once it is updated in base image + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libopencv-dev \ + openjdk-8-jdk-headless \ + openjdk-8-jdk \ + openjdk-8-jre \ + libglib2.0-0 \ + libgl1-mesa-glx \ + libsm6 \ + libxext6 \ + libxrender-dev \ + openjdk-11-jdk \ + software-properties-common \ + wget \ + unzip \ + vim \ + zlib1g-dev \ + openssl \ + libssl-dev \ + libsqlite3-dev \ + libgdbm-dev \ + libc6-dev \ + libbz2-dev \ + tk-dev \ + libffi-dev \ + libcap-dev \ + gnupg2 \ + gpg-agent \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list +RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - + +RUN apt-get update \ + && apt-get install -y \ + aws-neuronx-tools \ + aws-neuronx-collectives \ + aws-neuronx-runtime-lib \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +RUN wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-$OPEN_MPI_VERSION.tar.gz \ + && gunzip -c openmpi-$OPEN_MPI_VERSION.tar.gz | tar xf - \ + && cd openmpi-$OPEN_MPI_VERSION \ + && ./configure --prefix=/home/.openmpi \ + && make all install \ + && cd .. \ + && rm openmpi-$OPEN_MPI_VERSION.tar.gz \ + && rm -rf openmpi-$OPEN_MPI_VERSION + +# install Python +RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \ + && tar -xzf Python-$PYTHON_VERSION.tgz \ + && cd Python-$PYTHON_VERSION \ + && ./configure --enable-shared --prefix=/usr/local \ + && make -j $(nproc) && make install \ + && cd .. && rm -rf ../Python-$PYTHON_VERSION* \ + && ln -s /usr/local/bin/pip3 /usr/bin/pip \ + && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \ + && ${PIP} --no-cache-dir install --upgrade \ + pip \ + setuptools + +WORKDIR / + +# The ENV variables declared below are changed in the previous section +# Grouping these ENV variables in the first section causes +# ompi_info to fail. 
This is only observed in CPU containers +ENV PATH="$PATH:/home/.openmpi/bin" +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/" +RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value + +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh + +RUN ${PIP} install --no-cache-dir -U \ + "pyyaml>=5.4,<5.5" \ + "bokeh>=2.3,<3" \ + "opencv-python>=4.6,<5" \ + "awscli<2" \ + scipy \ + click \ + "cryptography>3.2" \ + "sagemaker>=2,<3" \ + "sagemaker-pytorch-training<3" \ + psutil==5.6.7 \ + dataset \ + transformers \ + 'Pillow>=9.0.1,<9.1.0' + +RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt +RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \ + && ${PIP} install --force-reinstall torch-neuronx==1.11.0.* --extra-index-url https://pip.repos.neuron.amazonaws.com \ + && ${PIP} install --force-reinstall neuronx-cc==2.* --extra-index-url https://pip.repos.neuron.amazonaws.com + +# attrs, neurox-cc required: >=19.2.0, sagemaker 2.103.0 <22,>=20.3.0 +# protobuf neurox-cc<4 , sagemaker training <3.20,>=3.9.2 +# awscli 1.25.47 has requirement docutils<0.17,>=0.10 +# etcd for kubernetes installation +RUN ${PIP} install --no-cache-dir -U \ + 'attrs>=20.3.0,<22.0.0' \ + 'protobuf>=2.9.2,<3.20' \ + 'docutils>=0.10,<0.17' \ + "python-etcd" + +# Install extra packages needed by sagemaker (for passing test_utility_packages_using_import) +RUN pip install --no-cache-dir -U \ + "bokeh>=2.3,<3" \ + "imageio>=2.9,<3" \ + "opencv-python>=4.3,<5" \ + "plotly>=5.1,<6" \ + "seaborn>=0.11,<1" \ + "numba<0.54" \ + "shap>=0.39,<1" \ + "numpy<=1.20.0,>=1.13.3" + +# EFA Installer does apt get. 
Make sure to run apt update before that +RUN apt-get update +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \ + && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \ + && cat aws-efa-installer.key | gpg --fingerprint \ + && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \ + && tar -xf aws-efa-installer-latest.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && cd $HOME + + +# Clean up after apt update +RUN rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \ + && chmod +x /usr/local/bin/deep_learning_container.py + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.9/license.txt + +# Starts framework +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] +CMD ["/bin/bash"] diff --git a/pytorch/training/docker/1.11/py3/sdk2.3.0/Dockerfile.neuron.os_scan_allowlist.json b/pytorch/training/docker/1.11/py3/sdk2.3.0/Dockerfile.neuron.os_scan_allowlist.json new file mode 100644 index 000000000000..817be8a97da7 --- /dev/null +++ b/pytorch/training/docker/1.11/py3/sdk2.3.0/Dockerfile.neuron.os_scan_allowlist.json @@ -0,0 +1,708 @@ +{ + "github.com/satori/go.uuid": [ + { + "description": "[github.com/satori/go.uuid](https://github.com/satori/go.uuid) is a provides pure Go implementation of Universally Unique Identifier (UUID).\n\nAffected versions of this package are vulnerable to Insecure Randomness producing predictable `UUID` identifiers due to the limited number of bytes read when using the `g.rand.Read` function.\r\n\r\n**Note:** This vulnerability was introduced on the master branch of `satori/go.uuid` after version 1.2.0. 
This was fixed on commit d91630c8510268e75203009fe7daf2b8e1d60c45.", + "vulnerability_id": "IN1-GOLANG-GITHUBCOMSATORIGOUUID-72488", + "name": "IN1-GOLANG-GITHUBCOMSATORIGOUUID-72488", + "package_name": "github.com/satori/go.uuid", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-monitor", + "name": "github.com/satori/go.uuid", + "package_manager": "GOBINARY", + "version": "v1.2.0", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 8.1, + "cvss_v30_score": 0, + "cvss_v31_score": 8.1, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://snyk.io/vuln/SNYK-GOLANG-GITHUBCOMSATORIGOUUID-72488", + "source": "SNYK", + "severity": "HIGH", + "status": "ACTIVE", + "title": "SNYK-GOLANG-GITHUBCOMSATORIGOUUID-72488 - github.com/satori/go.uuid, github.com/satori/go.uuid and 1 more" + }, + { + "description": "[github.com/satori/go.uuid](https://github.com/satori/go.uuid) is a provides pure Go implementation of Universally Unique Identifier (UUID).\n\nAffected versions of this package are vulnerable to Insecure Randomness producing predictable `UUID` identifiers due to the limited number of bytes read when using the `g.rand.Read` function.\r\n\r\n**Note:** This vulnerability was introduced on the master branch of `satori/go.uuid` after version 1.2.0. This was fixed on commit d91630c8510268e75203009fe7daf2b8e1d60c45.", + "vulnerability_id": "IN1-GOLANG-GITHUBCOMSATORIGOUUID-72488", + "name": "IN1-GOLANG-GITHUBCOMSATORIGOUUID-72488", + "package_name": "github.com/satori/go.uuid", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-profile", + "name": "github.com/satori/go.uuid", + "package_manager": "GOBINARY", + "version": "v1.2.0", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 8.1, + "cvss_v30_score": 0, + "cvss_v31_score": 8.1, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://snyk.io/vuln/SNYK-GOLANG-GITHUBCOMSATORIGOUUID-72488", + "source": "SNYK", + "severity": "HIGH", + "status": "ACTIVE", + "title": "SNYK-GOLANG-GITHUBCOMSATORIGOUUID-72488 - github.com/satori/go.uuid, github.com/satori/go.uuid and 1 more" + }, + { + "description": "[github.com/satori/go.uuid](https://github.com/satori/go.uuid) is a provides pure Go implementation of Universally Unique Identifier (UUID).\n\nAffected versions of this package are vulnerable to Insecure Randomness producing predictable `UUID` identifiers due to the limited number of bytes read when using the `g.rand.Read` function.\r\n\r\n**Note:** This vulnerability was introduced on the master branch of `satori/go.uuid` after version 1.2.0. 
This was fixed on commit d91630c8510268e75203009fe7daf2b8e1d60c45.", + "vulnerability_id": "IN1-GOLANG-GITHUBCOMSATORIGOUUID-72488", + "name": "IN1-GOLANG-GITHUBCOMSATORIGOUUID-72488", + "package_name": "github.com/satori/go.uuid", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-ls", + "name": "github.com/satori/go.uuid", + "package_manager": "GOBINARY", + "version": "v1.2.0", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 8.1, + "cvss_v30_score": 0, + "cvss_v31_score": 8.1, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://snyk.io/vuln/SNYK-GOLANG-GITHUBCOMSATORIGOUUID-72488", + "source": "SNYK", + "severity": "HIGH", + "status": "ACTIVE", + "title": "SNYK-GOLANG-GITHUBCOMSATORIGOUUID-72488 - github.com/satori/go.uuid, github.com/satori/go.uuid and 1 more" + }, + { + "description": " A flaw was found in github.com/satori/go.uuid in versions from commit 0ef6afb2f6cdd6cdaeee3885a95099c63f18fc8c to d91630c8510268e75203009fe7daf2b8e1d60c45. Due to insecure randomness in the g.rand.Read function the generated UUIDs are predictable for an attacker.", + "vulnerability_id": "CVE-2021-3538", + "name": "CVE-2021-3538", + "package_name": "github.com/satori/go.uuid", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-monitor", + "name": "github.com/satori/go.uuid", + "package_manager": "GOBINARY", + "version": "v1.2.0", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 9.8, + "cvss_v30_score": 0, + "cvss_v31_score": 9.8, + "cvss_v2_score": 7.5, + "cvss_v3_severity": "CRITICAL", + "source_url": "https://people.canonical.com/~ubuntu-security/cve/2021/CVE-2021-3538.html", + "source": "UBUNTU_CVE", + "severity": "MEDIUM", + "status": "ACTIVE", + "title": "CVE-2021-3538 - github.com/satori/go.uuid, github.com/satori/go.uuid and 1 more" + }, + { + "description": " A flaw was found in github.com/satori/go.uuid in versions from commit 0ef6afb2f6cdd6cdaeee3885a95099c63f18fc8c to d91630c8510268e75203009fe7daf2b8e1d60c45. Due to insecure randomness in the g.rand.Read function the generated UUIDs are predictable for an attacker.", + "vulnerability_id": "CVE-2021-3538", + "name": "CVE-2021-3538", + "package_name": "github.com/satori/go.uuid", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-profile", + "name": "github.com/satori/go.uuid", + "package_manager": "GOBINARY", + "version": "v1.2.0", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 9.8, + "cvss_v30_score": 0, + "cvss_v31_score": 9.8, + "cvss_v2_score": 7.5, + "cvss_v3_severity": "CRITICAL", + "source_url": "https://people.canonical.com/~ubuntu-security/cve/2021/CVE-2021-3538.html", + "source": "UBUNTU_CVE", + "severity": "MEDIUM", + "status": "ACTIVE", + "title": "CVE-2021-3538 - github.com/satori/go.uuid, github.com/satori/go.uuid and 1 more" + }, + { + "description": " A flaw was found in github.com/satori/go.uuid in versions from commit 0ef6afb2f6cdd6cdaeee3885a95099c63f18fc8c to d91630c8510268e75203009fe7daf2b8e1d60c45. 
Due to insecure randomness in the g.rand.Read function the generated UUIDs are predictable for an attacker.", + "vulnerability_id": "CVE-2021-3538", + "name": "CVE-2021-3538", + "package_name": "github.com/satori/go.uuid", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-ls", + "name": "github.com/satori/go.uuid", + "package_manager": "GOBINARY", + "version": "v1.2.0", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 9.8, + "cvss_v30_score": 0, + "cvss_v31_score": 9.8, + "cvss_v2_score": 7.5, + "cvss_v3_severity": "CRITICAL", + "source_url": "https://people.canonical.com/~ubuntu-security/cve/2021/CVE-2021-3538.html", + "source": "UBUNTU_CVE", + "severity": "MEDIUM", + "status": "ACTIVE", + "title": "CVE-2021-3538 - github.com/satori/go.uuid, github.com/satori/go.uuid and 1 more" + } + ], + "github.com/ulikunitz/xz": [ + { + "description": "[github.com/ulikunitz/xz](https://github.com/ulikunitz/xz) is a package for reading and writing of xz compressed streams.\n\nAffected versions of this package are vulnerable to Infinite Loop. An attacker could construct a byte sequence so that `readUvarint` would not stop to consume bytes.", + "vulnerability_id": "IN1-GOLANG-GITHUBCOMULIKUNITZXZ-598892", + "name": "IN1-GOLANG-GITHUBCOMULIKUNITZXZ-598892", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-ls", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://snyk.io/vuln/SNYK-GOLANG-GITHUBCOMULIKUNITZXZ-598892", + "source": "SNYK", + "severity": "HIGH", + "status": "ACTIVE", + "title": "SNYK-GOLANG-GITHUBCOMULIKUNITZXZ-598892 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": "[github.com/ulikunitz/xz](https://github.com/ulikunitz/xz) is a package for reading and writing of xz compressed streams.\n\nAffected versions of this package are vulnerable to Infinite Loop. An attacker could construct a byte sequence so that `readUvarint` would not stop to consume bytes.", + "vulnerability_id": "IN1-GOLANG-GITHUBCOMULIKUNITZXZ-598892", + "name": "IN1-GOLANG-GITHUBCOMULIKUNITZXZ-598892", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-profile", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://snyk.io/vuln/SNYK-GOLANG-GITHUBCOMULIKUNITZXZ-598892", + "source": "SNYK", + "severity": "HIGH", + "status": "ACTIVE", + "title": "SNYK-GOLANG-GITHUBCOMULIKUNITZXZ-598892 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": "[github.com/ulikunitz/xz](https://github.com/ulikunitz/xz) is a package for reading and writing of xz compressed streams.\n\nAffected versions of this package are vulnerable to Infinite Loop. 
An attacker could construct a byte sequence so that `readUvarint` would not stop to consume bytes.", + "vulnerability_id": "IN1-GOLANG-GITHUBCOMULIKUNITZXZ-598892", + "name": "IN1-GOLANG-GITHUBCOMULIKUNITZXZ-598892", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-monitor", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://snyk.io/vuln/SNYK-GOLANG-GITHUBCOMULIKUNITZXZ-598892", + "source": "SNYK", + "severity": "HIGH", + "status": "ACTIVE", + "title": "SNYK-GOLANG-GITHUBCOMULIKUNITZXZ-598892 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": "[github.com/ulikunitz/xz](https://github.com/ulikunitz/xz) is a package for reading and writing of xz compressed streams.\n\nAffected versions of this package are vulnerable to Denial of Service (DoS). It is possible create an infinite read loop due to the usage of the `ReadUvarint` and `ReadVarint` function when encoding/binary via invalid inputs.", + "vulnerability_id": "IN1-GOLANG-GITHUBCOMULIKUNITZXZ-607912", + "name": "IN1-GOLANG-GITHUBCOMULIKUNITZXZ-607912", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-ls", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://snyk.io/vuln/SNYK-GOLANG-GITHUBCOMULIKUNITZXZ-607912", + "source": "SNYK", + "severity": "HIGH", + "status": "ACTIVE", + "title": "SNYK-GOLANG-GITHUBCOMULIKUNITZXZ-607912 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": "[github.com/ulikunitz/xz](https://github.com/ulikunitz/xz) is a package for reading and writing of xz compressed streams.\n\nAffected versions of this package are vulnerable to Denial of Service (DoS). It is possible create an infinite read loop due to the usage of the `ReadUvarint` and `ReadVarint` function when encoding/binary via invalid inputs.", + "vulnerability_id": "IN1-GOLANG-GITHUBCOMULIKUNITZXZ-607912", + "name": "IN1-GOLANG-GITHUBCOMULIKUNITZXZ-607912", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-profile", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://snyk.io/vuln/SNYK-GOLANG-GITHUBCOMULIKUNITZXZ-607912", + "source": "SNYK", + "severity": "HIGH", + "status": "ACTIVE", + "title": "SNYK-GOLANG-GITHUBCOMULIKUNITZXZ-607912 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": "[github.com/ulikunitz/xz](https://github.com/ulikunitz/xz) is a package for reading and writing of xz compressed streams.\n\nAffected versions of this package are vulnerable to Denial of Service (DoS). 
It is possible create an infinite read loop due to the usage of the `ReadUvarint` and `ReadVarint` function when encoding/binary via invalid inputs.", + "vulnerability_id": "IN1-GOLANG-GITHUBCOMULIKUNITZXZ-607912", + "name": "IN1-GOLANG-GITHUBCOMULIKUNITZXZ-607912", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-monitor", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://snyk.io/vuln/SNYK-GOLANG-GITHUBCOMULIKUNITZXZ-607912", + "source": "SNYK", + "severity": "HIGH", + "status": "ACTIVE", + "title": "SNYK-GOLANG-GITHUBCOMULIKUNITZXZ-607912 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": " xz is a compression and decompression library focusing on the xz format completely written in Go. The function readUvarint used to read the xz container format may not terminate a loop provide malicous input. The problem has been fixed in release v0.5.8. As a workaround users can limit the size of the compressed file input to a reasonable size for their use case. The standard library had recently the same issue and got the CVE-2020-16845 allocated.", + "vulnerability_id": "CVE-2021-29482", + "name": "CVE-2021-29482", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-ls", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 5, + "cvss_v3_severity": "HIGH", + "source_url": "https://people.canonical.com/~ubuntu-security/cve/2021/CVE-2021-29482.html", + "source": "UBUNTU_CVE", + "severity": "MEDIUM", + "status": "ACTIVE", + "title": "CVE-2021-29482 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": " xz is a compression and decompression library focusing on the xz format completely written in Go. The function readUvarint used to read the xz container format may not terminate a loop provide malicous input. The problem has been fixed in release v0.5.8. As a workaround users can limit the size of the compressed file input to a reasonable size for their use case. 
The standard library had recently the same issue and got the CVE-2020-16845 allocated.", + "vulnerability_id": "CVE-2021-29482", + "name": "CVE-2021-29482", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-profile", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 5, + "cvss_v3_severity": "HIGH", + "source_url": "https://people.canonical.com/~ubuntu-security/cve/2021/CVE-2021-29482.html", + "source": "UBUNTU_CVE", + "severity": "MEDIUM", + "status": "ACTIVE", + "title": "CVE-2021-29482 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": " xz is a compression and decompression library focusing on the xz format completely written in Go. The function readUvarint used to read the xz container format may not terminate a loop provide malicous input. The problem has been fixed in release v0.5.8. As a workaround users can limit the size of the compressed file input to a reasonable size for their use case. The standard library had recently the same issue and got the CVE-2020-16845 allocated.", + "vulnerability_id": "CVE-2021-29482", + "name": "CVE-2021-29482", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-monitor", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 5, + "cvss_v3_severity": "HIGH", + "source_url": "https://people.canonical.com/~ubuntu-security/cve/2021/CVE-2021-29482.html", + "source": "UBUNTU_CVE", + "severity": "MEDIUM", + "status": "ACTIVE", + "title": "CVE-2021-29482 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": "Go before 1.13.15 and 14.x before 1.14.7 can have an infinite read loop in ReadUvarint and ReadVarint in encoding/binary via invalid inputs.", + "vulnerability_id": "GHSA-q6gq-997w-f55g", + "name": "GHSA-q6gq-997w-f55g", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-ls", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://github.com/advisories/GHSA-q6gq-997w-f55g", + "source": "GITHUB", + "severity": "HIGH", + "status": "ACTIVE", + "title": "GHSA-q6gq-997w-f55g - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": "Go before 1.13.15 and 14.x before 1.14.7 can have an infinite read loop in ReadUvarint and ReadVarint in encoding/binary via invalid inputs.", + "vulnerability_id": "GHSA-q6gq-997w-f55g", + "name": "GHSA-q6gq-997w-f55g", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-profile", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + 
} + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://github.com/advisories/GHSA-q6gq-997w-f55g", + "source": "GITHUB", + "severity": "HIGH", + "status": "ACTIVE", + "title": "GHSA-q6gq-997w-f55g - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": "Go before 1.13.15 and 14.x before 1.14.7 can have an infinite read loop in ReadUvarint and ReadVarint in encoding/binary via invalid inputs.", + "vulnerability_id": "GHSA-q6gq-997w-f55g", + "name": "GHSA-q6gq-997w-f55g", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-monitor", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://github.com/advisories/GHSA-q6gq-997w-f55g", + "source": "GITHUB", + "severity": "HIGH", + "status": "ACTIVE", + "title": "GHSA-q6gq-997w-f55g - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": " Go before 1.13.15 and 14.x before 1.14.7 can have an infinite read loop in ReadUvarint and ReadVarint in encoding/binary via invalid inputs.", + "vulnerability_id": "CVE-2020-16845", + "name": "CVE-2020-16845", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-ls", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 5, + "cvss_v3_severity": "HIGH", + "source_url": "https://people.canonical.com/~ubuntu-security/cve/2020/CVE-2020-16845.html", + "source": "UBUNTU_CVE", + "severity": "LOW", + "status": "ACTIVE", + "title": "CVE-2020-16845 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": " Go before 1.13.15 and 14.x before 1.14.7 can have an infinite read loop in ReadUvarint and ReadVarint in encoding/binary via invalid inputs.", + "vulnerability_id": "CVE-2020-16845", + "name": "CVE-2020-16845", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-profile", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 5, + "cvss_v3_severity": "HIGH", + "source_url": "https://people.canonical.com/~ubuntu-security/cve/2020/CVE-2020-16845.html", + "source": "UBUNTU_CVE", + "severity": "LOW", + "status": "ACTIVE", + "title": "CVE-2020-16845 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": " Go before 1.13.15 and 14.x before 1.14.7 can have an infinite read loop in ReadUvarint and ReadVarint in encoding/binary via invalid inputs.", + "vulnerability_id": "CVE-2020-16845", + "name": "CVE-2020-16845", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-monitor", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", 
+ "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 5, + "cvss_v3_severity": "HIGH", + "source_url": "https://people.canonical.com/~ubuntu-security/cve/2020/CVE-2020-16845.html", + "source": "UBUNTU_CVE", + "severity": "LOW", + "status": "ACTIVE", + "title": "CVE-2020-16845 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": "### Impact\n\nThe function readUvarint used to read the xz container format may not terminate a loop provide malicous input.\n\n### Patches\n\nThe problem has been fixed in release v0.5.8.\n\n### Workarounds\n\nLimit the size of the compressed file input to a reasonable size for your use case.\n\n### References\n\nThe standard library had recently the same issue and got the [CVE-2020-16845](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-16845) allocated.\n\n### For more information\nIf you have any questions or comments about this advisory:\n* Open an issue in [xz](https://github.com/ulikunitz/xz/issues).", + "vulnerability_id": "GHSA-25xm-hr59-7c27", + "name": "GHSA-25xm-hr59-7c27", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-ls", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://github.com/advisories/GHSA-25xm-hr59-7c27", + "source": "GITHUB", + "severity": "HIGH", + "status": "ACTIVE", + "title": "GHSA-25xm-hr59-7c27 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": "### Impact\n\nThe function readUvarint used to read the xz container format may not terminate a loop provide malicous input.\n\n### Patches\n\nThe problem has been fixed in release v0.5.8.\n\n### Workarounds\n\nLimit the size of the compressed file input to a reasonable size for your use case.\n\n### References\n\nThe standard library had recently the same issue and got the [CVE-2020-16845](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-16845) allocated.\n\n### For more information\nIf you have any questions or comments about this advisory:\n* Open an issue in [xz](https://github.com/ulikunitz/xz/issues).", + "vulnerability_id": "GHSA-25xm-hr59-7c27", + "name": "GHSA-25xm-hr59-7c27", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-profile", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://github.com/advisories/GHSA-25xm-hr59-7c27", + "source": "GITHUB", + "severity": "HIGH", + "status": "ACTIVE", + "title": "GHSA-25xm-hr59-7c27 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + }, + { + "description": "### Impact\n\nThe function readUvarint used to read the xz container format may not terminate a loop provide malicous input.\n\n### Patches\n\nThe problem has been fixed in release v0.5.8.\n\n### Workarounds\n\nLimit the size of the compressed 
file input to a reasonable size for your use case.\n\n### References\n\nThe standard library had recently the same issue and got the [CVE-2020-16845](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-16845) allocated.\n\n### For more information\nIf you have any questions or comments about this advisory:\n* Open an issue in [xz](https://github.com/ulikunitz/xz/issues).", + "vulnerability_id": "GHSA-25xm-hr59-7c27", + "name": "GHSA-25xm-hr59-7c27", + "package_name": "github.com/ulikunitz/xz", + "package_details": { + "file_path": "opt/aws/neuron/bin/neuron-monitor", + "name": "github.com/ulikunitz/xz", + "package_manager": "GOBINARY", + "version": "v0.5.6", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 7.5, + "cvss_v30_score": 0, + "cvss_v31_score": 7.5, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://github.com/advisories/GHSA-25xm-hr59-7c27", + "source": "GITHUB", + "severity": "HIGH", + "status": "ACTIVE", + "title": "GHSA-25xm-hr59-7c27 - github.com/ulikunitz/xz, github.com/ulikunitz/xz and 1 more" + } + ], + "networkx": [ + { + "description": "[networkx](https://pypi.org/project/networkx/) is a Python package for creating and manipulating graphs and networks\n\nAffected versions of this package are vulnerable to Deserialization of Untrusted Data. This package is vulnerable to arbitrary code execution via insecure YAML deserialization due to the use of a known vulnerable function `load()` in yaml, which is called in `read_yaml()` in `networkx/readwrite/nx_yaml.py`.\r\n\r\n`networkx/readwrite/nx_yaml.py` is deprecated and scheduled to be removed in the next release of Networkx. Users should avoid using this function completely.", + "vulnerability_id": "IN1-PYTHON-NETWORKX-1062709", + "name": "IN1-PYTHON-NETWORKX-1062709", + "package_name": "networkx", + "package_details": { + "file_path": "usr/local/lib/python3.8/site-packages/networkx-2.5.dist-info/METADATA", + "name": "networkx", + "package_manager": "PYTHONPKG", + "version": "2.5", + "release": null + }, + "remediation": { + "recommendation": { + "text": "None Provided" + } + }, + "cvss_v3_score": 8.8, + "cvss_v30_score": 0, + "cvss_v31_score": 8.8, + "cvss_v2_score": 0, + "cvss_v3_severity": "HIGH", + "source_url": "https://snyk.io/vuln/SNYK-PYTHON-NETWORKX-1062709", + "source": "SNYK", + "severity": "HIGH", + "status": "ACTIVE", + "title": "SNYK-PYTHON-NETWORKX-1062709 - networkx" + } + ] +} \ No newline at end of file diff --git a/release_images.yml b/release_images.yml index 9e338c6e4212..fc4fcc95b3e6 100644 --- a/release_images.yml +++ b/release_images.yml @@ -54,4 +54,4 @@ release_images: disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached # to images being published. force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. \ No newline at end of file + # has already been published. Re-released image will have minor version incremented by 1. 
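The OS scan allowlist added above (Dockerfile.neuron.os_scan_allowlist.json) is keyed by package name; each entry records a reviewed scanner finding by its vulnerability_id together with the affected binary path and CVSS scores. As a minimal sketch only, not the repository's actual scan tooling, such a file could be consumed to separate already-reviewed findings from new ones; the function names, the sample scan-result shape, and the placeholder CVE id below are assumptions made for illustration.

import json

def load_allowed_vulnerability_ids(allowlist_path):
    # Collect every vulnerability_id recorded in an os_scan_allowlist JSON file.
    # Top-level keys are package names; each value is a list of finding dicts.
    with open(allowlist_path) as f:
        allowlist = json.load(f)
    allowed = set()
    for findings in allowlist.values():
        for finding in findings:
            allowed.add(finding["vulnerability_id"])
    return allowed

def filter_new_findings(scan_findings, allowed_ids):
    # Keep only findings whose vulnerability_id has not already been allowlisted.
    return [f for f in scan_findings if f.get("vulnerability_id") not in allowed_ids]

if __name__ == "__main__":
    # Hypothetical usage; the path below exists in this patch, but the scan-result
    # format and the second CVE id are illustrative assumptions.
    allowed = load_allowed_vulnerability_ids(
        "pytorch/training/docker/1.11/py3/sdk2.3.0/Dockerfile.neuron.os_scan_allowlist.json"
    )
    sample_scan = [
        {"vulnerability_id": "CVE-2021-3538"},  # allowlisted above, filtered out
        {"vulnerability_id": "CVE-2099-0001"},  # placeholder id, surfaces as a new finding
    ]
    print(filter_new_findings(sample_scan, allowed))

In this sketch, only findings absent from the allowlist would be reported, which mirrors the intent of shipping a per-Dockerfile allowlist next to the image definition.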
diff --git a/src/utils.py b/src/utils.py index e209037f22a8..20b1bc1fc121 100644 --- a/src/utils.py +++ b/src/utils.py @@ -523,6 +523,11 @@ def get_safety_ignore_dict(image_uri, framework, python_version, job_type): "inference-eia" if "eia" in image_uri else "inference-neuron" if "neuron" in image_uri else "inference" ) + if job_type == "training": + job_type = ( + "training-neuron" if "neuron" in image_uri else "training" + ) + if "habana" in image_uri: framework = f"habana_{framework}" diff --git a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_trcomp_performance.py b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_trcomp_performance.py index 6b97f1502ab0..288a7c9660f6 100644 --- a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_trcomp_performance.py +++ b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_trcomp_performance.py @@ -235,4 +235,4 @@ def test_gpt2(self, instance_type, num_gpus, total_n_gpus, instance_count, tenso ) LOGGER.info(result) assert billable>=1000, 'False Positive '+result - assert billable<=threshold, result \ No newline at end of file + assert billable<=threshold, result diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index 60d0d015d2e1..1dd5d5bfa801 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -4,6 +4,7 @@ import random import sys import re +import time import uuid import boto3 from botocore.exceptions import ClientError @@ -27,6 +28,7 @@ is_nightly_context, DEFAULT_REGION, P3DN_REGION, + TRN1_REGION, UBUNTU_18_BASE_DLAMI_US_EAST_1, UBUNTU_18_BASE_DLAMI_US_WEST_2, PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1, @@ -57,12 +59,14 @@ "pytorch_inference", "pytorch_inference_eia", "pytorch_inference_neuron", + "pytorch_training_neuron", "pytorch_inference_graviton", # TensorFlow "tensorflow_training", "tensorflow_inference", "tensorflow_inference_eia", "tensorflow_inference_neuron", + "tensorflow_training_neuron", "tensorflow_training_habana", "tensorflow_inference_graviton", # MxNET @@ -70,6 +74,7 @@ "mxnet_inference", "mxnet_inference_eia", "mxnet_inference_neuron", + "mxnet_training_neuron", "mxnet_inference_graviton", # HuggingFace "huggingface_tensorflow_training", @@ -289,6 +294,12 @@ def ec2_instance( if ec2_instance_ami == AML2_GPU_DLAMI_US_WEST_2 else UBUNTU_18_BASE_DLAMI_US_EAST_1 ) + + if ec2_instance_type == "trn1.32xlarge" or ec2_instance_type == "trn1.2xlarge": + region = TRN1_REGION + ec2_client = boto3.client("ec2", region_name=region, config=Config(retries={"max_attempts": 10})) + ec2_resource = boto3.resource("ec2", region_name=region, config=Config(retries={"max_attempts": 10})) + ec2_key_name = f"{ec2_key_name}-{str(uuid.uuid4())}" print(f"Creating instance: CI-CD {ec2_key_name}") @@ -348,6 +359,12 @@ def delete_ssh_keypair(): # Using private AMI, the EBS volume size is reduced to 28GB as opposed to 50GB from public AMI. 
This leads to space issues on test instances # TODO: Revert the configuration once DLAMI is public params["BlockDeviceMappings"] = [{"DeviceName": volume_name, "Ebs": {"VolumeSize": 90,},}] + + # For TRN1 since we are using a private AMI that has some BERT data/tests, have a bifgger volume size + # Once use DLAMI, this can be removed + if ec2_instance_type == "trn1.32xlarge" or ec2_instance_type == "trn1.2xlarge": + params["BlockDeviceMappings"] = [{"DeviceName": volume_name, "Ebs": {"VolumeSize": 1024,},}] + if ei_accelerator_type: params["ElasticInferenceAccelerators"] = [{"Type": ei_accelerator_type, "Count": 1}] availability_zones = { @@ -392,6 +409,7 @@ def is_neuron_image(fixtures): :return: bool """ neuron_fixtures = ["tensorflow_inference_neuron", "mxnet_inference_neuron", "pytorch_inference_neuron"] + neuron_fixtures += ["tensorflow_training_neuron", "mxnet_training_neuron", "pytorch_training_neuron"] for fixture in neuron_fixtures: if fixture in fixtures: @@ -412,9 +430,14 @@ def ec2_connection(request, ec2_instance, ec2_key_name, ec2_instance_type, regio """ instance_id, instance_pem_file = ec2_instance region = P3DN_REGION if ec2_instance_type == "p3dn.24xlarge" else region + region = TRN1_REGION if ec2_instance_type == "trn1.32xlarge" or ec2_instance_type == "trn1.2xlarge" else region ip_address = ec2_utils.get_public_ip(instance_id, region=region) LOGGER.info(f"Instance ip_address: {ip_address}") user = ec2_utils.get_instance_user(instance_id, region=region) + + # Hack for the time being. Seeing that the instance gets rebooted after it comes up for some reason + if "pytorch_training_neuron" in request.fixturenames: + time.sleep(300) LOGGER.info(f"Connecting to {user}@{ip_address}") conn = Connection( user=user, host=ip_address, connect_kwargs={"key_filename": [instance_pem_file]}, connect_timeout=18000, @@ -433,6 +456,9 @@ def delete_s3_artifact_copy(): conn.run(f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} $HOME/container_tests") conn.run(f"mkdir -p $HOME/container_tests/logs && chmod -R +x $HOME/container_tests/*") + # Since using old driver that has a bug on reboot, have to do this here + if "pytorch_training_neuron" in request.fixturenames: + conn.run(f"sudo modprobe -r neuron && sudo modprobe -i neuron") # Log into ECR if we are in canary context if test_utils.is_canary_context(): diff --git a/test/dlc_tests/container_tests/bin/pytorch_tests/testNeuronSingleAllReduce b/test/dlc_tests/container_tests/bin/pytorch_tests/testNeuronSingleAllReduce new file mode 100644 index 000000000000..4ba18c34a033 --- /dev/null +++ b/test/dlc_tests/container_tests/bin/pytorch_tests/testNeuronSingleAllReduce @@ -0,0 +1,29 @@ +import os +import torch_xla.core.xla_model as xm +import torch +import torch_xla.distributed.xla_backend +torch.distributed.init_process_group('xla') +import torch_xla.distributed.xla_multiprocessing as xmp +os.environ["NEURON_RT_EXEC_TIMEOUT"] = "20" +os.environ["NCCL_DEBUG"] = "WARN" +os.environ["NCCL_DEBUG_SUBSYS"] = "ALL" +def _mp_fn(): + world_size = xm.xrt_world_size() + device = xm.xla_device() + rank = xm.get_ordinal() + ones = torch.ones((2, 3)) + xones = ones.to(device) + if world_size > 0: + print("running all reduce") + for i in range(0, 5): + print(f'at iteration {i}, with local rank {rank}', flush=True) + result = xm.all_reduce(xm.REDUCE_SUM, xones) + result_cpu = result.cpu() + #xm.mark_step() + print(result_cpu, flush = True) + expected = torch.ones((2,3))*world_size + assert expected.allclose(result_cpu) + print('PASS') 
+if __name__ == '__main__': + _mp_fn() + #xmp.spawn(_mp_fn, args=(),nprocs=2, join=True) \ No newline at end of file diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py index 0d47d3400e84..3dae82e3ea22 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py @@ -7,7 +7,7 @@ import test.test_utils as test_utils import test.test_utils.ec2 as ec2_utils -from test.test_utils import CONTAINER_TESTS_PREFIX, UBUNTU_18_HPU_DLAMI_US_WEST_2, get_framework_and_version_from_tag, get_cuda_version_from_tag +from test.test_utils import TRN1_REGION, CONTAINER_TESTS_PREFIX, UBUNTU_18_HPU_DLAMI_US_WEST_2, get_framework_and_version_from_tag, get_cuda_version_from_tag from test.test_utils.ec2 import execute_ec2_training_test, get_ec2_instance_type @@ -22,6 +22,8 @@ PT_HABANA_TEST_SUITE_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testHabanaPTSuite") PT_TORCHAUDIO_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchaudio") PT_TORCHDATA_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchdata") +PT_NEURON_TEST_SCRIPT = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testNeuronSingleAllReduce") +PT_NEURON_TEST_CMD = f"python3 -m torch.distributed.launch --nproc_per_node=2 --nnodes=1 --node_rank=0 --master_addr=localhost --master_port=2022 {PT_NEURON_TEST_SCRIPT} --enable_dist_launch" PT_TORCHDATA_DEV_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchdataDev") @@ -34,7 +36,14 @@ default="g3.8xlarge", processor="gpu", filter_function=ec2_utils.filter_only_multi_gpu, ) PT_EC2_HPU_INSTANCE_TYPE = get_ec2_instance_type(default="dl1.24xlarge", processor="hpu") - +PT_EC2_NEURON_TRN1_INSTANCE_TYPE = get_ec2_instance_type(default="trn1.2xlarge", processor="neuron", job_type="training") + +@pytest.mark.parametrize("ec2_instance_ami", [test_utils.NEURON_TRN1_AMI_US_EAST_1], indirect=True) +@pytest.mark.parametrize("ec2_instance_type", PT_EC2_NEURON_TRN1_INSTANCE_TYPE, indirect=True) +@pytest.mark.integration("pytorch_neuron_sanity_test") +@pytest.mark.model("xla") +def test_pytorch_standalone_neuron(pytorch_training_neuron, ec2_connection): + execute_ec2_training_test(ec2_connection, pytorch_training_neuron, PT_NEURON_TEST_CMD) @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("pytorch_sanity_test") diff --git a/test/dlc_tests/sanity/test_pre_release.py b/test/dlc_tests/sanity/test_pre_release.py index 43e4210913ba..c43c349ca787 100644 --- a/test/dlc_tests/sanity/test_pre_release.py +++ b/test/dlc_tests/sanity/test_pre_release.py @@ -37,7 +37,8 @@ get_processor_from_image_uri, execute_env_variables_test, UL20_CPU_ARM64_US_WEST_2, - UBUNTU_18_HPU_DLAMI_US_WEST_2 + UBUNTU_18_HPU_DLAMI_US_WEST_2, + NEURON_UBUNTU_18_BASE_DLAMI_US_WEST_2 ) @@ -430,8 +431,8 @@ def _run_dependency_check_test(image, ec2_connection): "mxnet": {"1.8": ["neuron"], "1.9": ["cpu", "gpu"]}, "pytorch": { "1.8": ["cpu", "gpu"], - "1.10": ["cpu", "hpu"], - "1.11": ["cpu", "gpu", "hpu"], + "1.10": ["cpu", "hpu", "neuron"], + "1.11": ["cpu", "gpu", "hpu", "neuron"], "1.12": ["cpu", "gpu", "hpu"] }, "huggingface_pytorch": {"1.8": ["cpu", "gpu"], "1.9": ["cpu", "gpu"]}, @@ -449,8 +450,8 @@ def _run_dependency_check_test(image, ec2_connection): # Check that these versions have been matched on https://ubuntu.com/security/CVE-2022-1292 before adding allow_openssl_cve_2022_1292_fw_versions = { "pytorch": { - "1.10": ["gpu", 
"cpu", "hpu"], - "1.11": ["gpu", "cpu", "hpu"], + "1.10": ["gpu", "cpu", "hpu", "neuron"], + "1.11": ["gpu", "cpu", "hpu", "neuron"], "1.12": ["gpu", "cpu", "hpu"], }, "tensorflow": { @@ -574,6 +575,7 @@ def test_dependency_check_hpu(hpu, ec2_connection): @pytest.mark.usefixtures("sagemaker", "huggingface") @pytest.mark.model("N/A") @pytest.mark.parametrize("ec2_instance_type", ["inf1.xlarge"], indirect=True) +@pytest.mark.parametrize("ec2_instance_ami", [NEURON_UBUNTU_18_BASE_DLAMI_US_WEST_2], indirect=True) def test_dependency_check_neuron(neuron, ec2_connection): _run_dependency_check_test(neuron, ec2_connection) diff --git a/test/dlc_tests/sanity/test_safety_check.py b/test/dlc_tests/sanity/test_safety_check.py index 4e4f5313cb8b..4f13d4f63bf2 100644 --- a/test/dlc_tests/sanity/test_safety_check.py +++ b/test/dlc_tests/sanity/test_safety_check.py @@ -434,6 +434,18 @@ "42772", "42814", "42815", + ], + }, + "training-neuron":{ + "_comment":"py2 is deprecated", + "py2": [ + ], + "py3": [ + # not possible for neuron-cc + "43453", + "44715", + "44717", + "44716", # for releasing PT1.12 safety check tools might report a vulnerability for the package commonmarker, # which is a dependency of deepspeed. # This package is only used to build the documentation pages of deepspeed @@ -444,6 +456,10 @@ "48298", # for cryptography until e have 39.0.0 release "51159", + # for Safety. it is test package and not part of image + "51358", + # Ignored- please check https://github.com/pytest-dev/py/issues/287 + "51457", ], }, "inference": { @@ -661,12 +677,14 @@ def _get_safety_ignore_list(image_uri): framework = "tensorflow" job_type = ( - "training" + "training-neuron" + if "training-neuron" in image_uri + else "training" if "training" in image_uri else "inference-eia" if "eia" in image_uri else "inference-neuron" - if "neuron" in image_uri + if "inference-neuron" in image_uri else "inference" ) python_version = "py2" if "py2" in image_uri else "py3" diff --git a/test/sagemaker_tests/pytorch/training/conftest.py b/test/sagemaker_tests/pytorch/training/conftest.py index 1b722fd42bc7..00611db92615 100644 --- a/test/sagemaker_tests/pytorch/training/conftest.py +++ b/test/sagemaker_tests/pytorch/training/conftest.py @@ -49,6 +49,15 @@ dir_path = os.path.dirname(os.path.realpath(__file__)) +NEURON_TRN1_REGIONS = [ + "us-east-1", +] + +NEURON_TRN1_INSTANCES = [ + "ml.trn1.2xlarge", + "ml.trn1.32xlarge" +] + NO_P2_REGIONS = [ "ap-east-1", "ap-northeast-3", @@ -116,7 +125,7 @@ def pytest_addoption(parser): parser.addoption('--region', default='us-west-2') parser.addoption('--framework-version', default='') parser.addoption('--py-version', choices=['2', '3', '37', '38'], default=str(sys.version_info.major)) - parser.addoption('--processor', choices=['gpu', 'cpu'], default='cpu') + parser.addoption('--processor', choices=['gpu', 'cpu', 'neuron'], default='cpu') # If not specified, will default to {framework-version}-{processor}-py{py-version} parser.addoption('--tag', default=None) parser.addoption('--generate-coverage-doc', default=False, action='store_true', @@ -316,6 +325,14 @@ def fixture_dist_gpu_backend(request): @pytest.fixture(autouse=True) def skip_by_device_type(request, use_gpu, instance_type): is_gpu = use_gpu or instance_type[3] in ['g', 'p'] + is_neuron = instance_type in NEURON_TRN1_INSTANCES + + #If neuron run only tests marked as neuron + if (is_neuron and not request.node.get_closest_marker("neuron_test")): + pytest.skip("Skipping because running on \"{}\" instance".format(instance_type)) + if 
diff --git a/test/sagemaker_tests/pytorch/training/conftest.py b/test/sagemaker_tests/pytorch/training/conftest.py
index 1b722fd42bc7..00611db92615 100644
--- a/test/sagemaker_tests/pytorch/training/conftest.py
+++ b/test/sagemaker_tests/pytorch/training/conftest.py
@@ -49,6 +49,15 @@
 dir_path = os.path.dirname(os.path.realpath(__file__))
 
+NEURON_TRN1_REGIONS = [
+    "us-east-1",
+]
+
+NEURON_TRN1_INSTANCES = [
+    "ml.trn1.2xlarge",
+    "ml.trn1.32xlarge"
+]
+
 NO_P2_REGIONS = [
     "ap-east-1",
     "ap-northeast-3",
@@ -116,7 +125,7 @@ def pytest_addoption(parser):
     parser.addoption('--region', default='us-west-2')
     parser.addoption('--framework-version', default='')
     parser.addoption('--py-version', choices=['2', '3', '37', '38'], default=str(sys.version_info.major))
-    parser.addoption('--processor', choices=['gpu', 'cpu'], default='cpu')
+    parser.addoption('--processor', choices=['gpu', 'cpu', 'neuron'], default='cpu')
     # If not specified, will default to {framework-version}-{processor}-py{py-version}
     parser.addoption('--tag', default=None)
     parser.addoption('--generate-coverage-doc', default=False, action='store_true',
@@ -316,6 +325,14 @@ def fixture_dist_gpu_backend(request):
 @pytest.fixture(autouse=True)
 def skip_by_device_type(request, use_gpu, instance_type):
     is_gpu = use_gpu or instance_type[3] in ['g', 'p']
+    is_neuron = instance_type in NEURON_TRN1_INSTANCES
+
+    # If running on a Neuron instance, run only tests marked as neuron_test;
+    # conversely, skip neuron_test-marked tests on non-Neuron instances.
+    if is_neuron and not request.node.get_closest_marker("neuron_test"):
+        pytest.skip("Skipping because running on \"{}\" instance".format(instance_type))
+    if request.node.get_closest_marker("neuron_test") and not is_neuron:
+        pytest.skip("Skipping because running on \"{}\" instance".format(instance_type))
+
     if (request.node.get_closest_marker('skip_gpu') and is_gpu) or \
             (request.node.get_closest_marker('skip_cpu') and not is_gpu):
         pytest.skip('Skipping because running on \'{}\' instance'.format(instance_type))
@@ -346,6 +363,12 @@ def skip_gpu_instance_restricted_regions(region, instance_type):
             or (region in NO_P4_REGIONS and instance_type.startswith('ml.p4'))):
         pytest.skip('Skipping GPU test in region {}'.format(region))
 
+@pytest.fixture(autouse=True)
+def skip_neuron_trn1_test_in_region(request, region):
+    if request.node.get_closest_marker('skip_neuron_trn1_test_in_region'):
+        if region not in NEURON_TRN1_REGIONS:
+            pytest.skip('Skipping SageMaker test in region {}'.format(region))
+
 @pytest.fixture(autouse=True)
 def skip_py2_containers(request, tag):
diff --git a/test/sagemaker_tests/pytorch/training/integration/__init__.py b/test/sagemaker_tests/pytorch/training/integration/__init__.py
index 25e1a73f28b3..91025e9870ff 100644
--- a/test/sagemaker_tests/pytorch/training/integration/__init__.py
+++ b/test/sagemaker_tests/pytorch/training/integration/__init__.py
@@ -30,6 +30,7 @@
 data_dir = os.path.join(mnist_path, 'data')
 training_dir = os.path.join(data_dir, 'training')
 dist_operations_path = os.path.join(resources_path, 'distributed_operations.py')
+neuron_allreduce_path = os.path.join(resources_path, 'neuron')
 smdebug_mnist_script = os.path.join(mnist_path, 'smdebug_mnist.py')
 mnist_1d_script = os.path.join(mnist_path, 'mnist_1d.py')
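Taken together, the two autouse fixtures above mean a SageMaker test only participates in Neuron runs by opting in with the neuron_test marker, and such tests are additionally region-gated to NEURON_TRN1_REGIONS when marked accordingly. A hypothetical test module showing the intended opt-in pattern (the test body is illustrative only):

    import pytest

    @pytest.mark.neuron_test                      # run only when the instance is an ml.trn1 type
    @pytest.mark.skip_neuron_trn1_test_in_region  # and only in a region from NEURON_TRN1_REGIONS
    @pytest.mark.processor("neuron")
    def test_opt_in_neuron_example():
        # Illustrative placeholder body; real tests launch a SageMaker training job.
        assert True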
diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_neuron.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_neuron.py
new file mode 100644
index 000000000000..5f6a08454bd3
--- /dev/null
+++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_neuron.py
@@ -0,0 +1,61 @@
+# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os
+import pytest
+import sagemaker
+from sagemaker import utils
+from sagemaker.pytorch import PyTorch
+from ...integration import (neuron_allreduce_path, DEFAULT_TIMEOUT)
+from ...integration.sagemaker.timeout import timeout
+from .... import invoke_pytorch_helper_function
+
+RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
+
+
+@pytest.mark.processor("neuron")
+@pytest.mark.model("unknown_model")
+@pytest.mark.neuron_test
+def test_neuron_allreduce_single_node(framework_version, ecr_image, sagemaker_regions, instance_type):
+    function_args = {
+        'framework_version': framework_version,
+        'instance_type': instance_type,
+        'num_neuron_cores': 2,
+    }
+    invoke_pytorch_helper_function(ecr_image, sagemaker_regions, _test_neuron_allreduce, function_args)
+
+def _test_neuron_allreduce(
+    ecr_image, sagemaker_session, framework_version, instance_type, instance_count=1, num_neuron_cores=2
+):
+    with timeout(minutes=DEFAULT_TIMEOUT):
+        pytorch = PyTorch(
+            entry_point='entrypoint.py',
+            source_dir=neuron_allreduce_path,
+            role='SageMakerRole',
+            instance_count=instance_count,
+            instance_type=instance_type,
+            sagemaker_session=sagemaker_session,
+            image_uri=ecr_image,
+            framework_version=framework_version,
+            hyperparameters={'nproc-per-node': num_neuron_cores, 'nnodes': instance_count},
+            disable_profiler=True,
+            env={"NEURON_RT_LOG_LEVEL": "DEBUG"}
+        )
+
+        pytorch.sagemaker_session.default_bucket()
+        fake_input = pytorch.sagemaker_session.upload_data(
+            path=neuron_allreduce_path, key_prefix='pytorch/distributed_operations'
+        )
+
+        pytorch.fit({'required_argument': fake_input}, job_name=utils.unique_name_from_base('test-pt-neuron-allreduce'))
diff --git a/test/sagemaker_tests/pytorch/training/resources/neuron/all_reduce.py b/test/sagemaker_tests/pytorch/training/resources/neuron/all_reduce.py
new file mode 100644
index 000000000000..f37f483187ef
--- /dev/null
+++ b/test/sagemaker_tests/pytorch/training/resources/neuron/all_reduce.py
@@ -0,0 +1,44 @@
+import os
+import subprocess
+import torch_xla.core.xla_model as xm
+import torch
+import torch_xla.distributed.xla_backend
+torch.distributed.init_process_group('xla')
+import torch_xla.distributed.xla_multiprocessing as xmp
+import logging
+import sys
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+logger.addHandler(logging.StreamHandler(sys.stdout))
+
+def _mp_fn():
+    os.environ['NEURON_CC_FLAGS'] = os.environ.get('NEURON_CC_FLAGS', '') + " --cache_dir=neff_cache2"
+    os.environ['FI_EFA_USE_DEVICE_RDMA'] = '1'
+    os.environ['FI_PROVIDER'] = 'efa'
+    os.environ["NCCL_DEBUG"] = "TRACE"
+    os.environ["NCCL_INIT"] = "TRACE"
+    os.environ["NCCL_DEBUG_SUBSYS"] = "ALL"
+    os.environ["NCCL_SOCKET_IFNAME"] = os.environ["SM_NETWORK_INTERFACE_NAME"]
+    os.environ["NEURON_RT_LOG_LEVEL"] = "INFO"
+
+    world_size = xm.xrt_world_size()
+    device = xm.xla_device()
+    rank = xm.get_ordinal()
+    ones = torch.ones((2, 3))
+    xones = ones.to(device)
+    if world_size > 0:
+        print("running all reduce")
+        for i in range(0, 5):
+            print(f'at iteration {i}, with local rank {rank}', flush=True)
+            result = xm.all_reduce(xm.REDUCE_SUM, xones)
+            result_cpu = result.cpu()
+            xm.mark_step()
+            print(result_cpu, flush=True)
+        expected = torch.ones((2, 3)) * world_size
+        assert expected.allclose(result_cpu)
+        logger.info('PASS')
+
+if __name__ == '__main__':
+    _mp_fn()
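The script asserts the textbook all-reduce identity: with world_size workers each contributing torch.ones((2, 3)), a SUM all-reduce must return a tensor filled with world_size. The same check can be reproduced without Neuron or torch_xla; the sketch below uses a single-process gloo group purely for illustration:

    import torch
    import torch.distributed as dist

    def main():
        # Single-process "gloo" group only to illustrate the expected all-reduce
        # result; the real test uses the "xla" backend across Neuron cores.
        dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1)
        ones = torch.ones((2, 3))
        dist.all_reduce(ones, op=dist.ReduceOp.SUM)
        expected = torch.ones((2, 3)) * dist.get_world_size()
        assert expected.allclose(ones)
        dist.destroy_process_group()

    if __name__ == "__main__":
        main()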
diff --git a/test/sagemaker_tests/pytorch/training/resources/neuron/entrypoint.py b/test/sagemaker_tests/pytorch/training/resources/neuron/entrypoint.py
new file mode 100644
index 000000000000..ef04111c80db
--- /dev/null
+++ b/test/sagemaker_tests/pytorch/training/resources/neuron/entrypoint.py
@@ -0,0 +1,54 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import sys
+import argparse
+import json
+import logging
+import os
+from sagemaker_training import environment
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+logger.addHandler(logging.StreamHandler(sys.stdout))
+
+def main():
+    print('Starting')
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--nproc-per-node', type=int, default=32)
+    parser.add_argument('--nnodes', type=int, default=1)
+    parser.add_argument('--master-port', type=str, default='55555')
+    parser.add_argument('--nccl-socket-ifname', type=str, default=os.environ["SM_NETWORK_INTERFACE_NAME"])
+    parser.add_argument('--train-script-args', type=str, default=" ")
+    parser.add_argument('--hosts', type=list, default=json.loads(os.environ["SM_HOSTS"]))
+
+    args = parser.parse_args()
+    env = environment.Environment()
+    master_addr = env.master_hostname
+    master_port = args.master_port
+    current_host = env.current_host
+
+    hosts = args.hosts
+    node_rank = hosts.index(current_host)
+
+    nccl_socket_ifname = args.nccl_socket_ifname
+
+    torchrun_cmd = f'NEURON_RT_LOG_LEVEL="INFO" FI_EFA_USE_DEVICE_RDMA="1" FI_PROVIDER="efa" NCCL_DEBUG="INFO" NCCL_INIT="INFO" NCCL_DEBUG_SUBSYS="ALL" NCCL_SOCKET_IFNAME={nccl_socket_ifname} torchrun --nproc_per_node={args.nproc_per_node} --nnodes={args.nnodes} --node_rank={node_rank} --master_addr={master_addr} --master_port={master_port} all_reduce.py {args.train_script_args}'
+    logger.info(f'Calling {torchrun_cmd}')
+    os.system(torchrun_cmd)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
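The entrypoint derives the torchrun rendezvous settings from the SageMaker training environment: node_rank is simply the index of the current host in SM_HOSTS, and the master address comes from sagemaker_training's env.master_hostname. A minimal sketch with made-up host names (the values are illustrative, not taken from this patch):

    import json

    # Illustrative SageMaker environment for a hypothetical 2-node job.
    sm_hosts = json.loads('["algo-1", "algo-2"]')
    current_host = "algo-2"
    master_addr = sm_hosts[0]          # the entrypoint reads this from env.master_hostname
    node_rank = sm_hosts.index(current_host)

    cmd = (
        f"torchrun --nproc_per_node=32 --nnodes={len(sm_hosts)} "
        f"--node_rank={node_rank} --master_addr={master_addr} --master_port=55555 all_reduce.py"
    )
    print(cmd)
    # torchrun --nproc_per_node=32 --nnodes=2 --node_rank=1 --master_addr=algo-1 --master_port=55555 all_reduce.py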
diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py
index d64106c55dde..23260da8ae4b 100644
--- a/test/test_utils/__init__.py
+++ b/test/test_utils/__init__.py
@@ -34,6 +34,8 @@
 DEFAULT_REGION = "us-west-2"
 # Constant to represent region where p3dn tests can be run
 P3DN_REGION = "us-east-1"
+# Constant to represent region where TRN1 tests can be run
+TRN1_REGION = "us-east-1"
 def get_ami_id_boto3(region_name, ami_name_pattern):
     """
     For a given region and ami name pattern, return the latest ami-id
@@ -65,6 +67,8 @@ def get_ami_id_ssm(region_name, parameter_path):
 PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_WEST_2 = "ami-02d9a47bc61a31d43"
 # Since latest driver is not in public DLAMI yet, using a custom one
 NEURON_UBUNTU_18_BASE_DLAMI_US_WEST_2 = get_ami_id_boto3(region_name="us-west-2", ami_name_pattern="Deep Learning Base AMI (Ubuntu 18.04) Version ??.?")
+# Since the Neuron TRN1 DLAMI is not released yet, use a custom AMI
+NEURON_TRN1_AMI_US_EAST_1 = "ami-0a3a08190eb7e5b29"
 # Habana Base v0.15.4 ami
 # UBUNTU_18_HPU_DLAMI_US_WEST_2 = "ami-0f051d0c1a667a106"
 # UBUNTU_18_HPU_DLAMI_US_EAST_1 = "ami-04c47cb3d4fdaa874"
@@ -139,6 +143,8 @@ def get_ami_id_ssm(region_name, parameter_path):
 PUBLIC_DLC_REGISTRY = "763104351884"
 SAGEMAKER_EXECUTION_REGIONS = ["us-west-2", "us-east-1", "eu-west-1"]
+# Until SageMaker GA with Trn1, ml.trn1 instances can only be launched in us-east-1. Remove this after SM GA.
+SAGEMAKER_NEURON_EXECUTION_REGIONS = ["us-east-1"]
 UPGRADE_ECR_REPO_NAME = "upgraded-image-ecr-scan-repo"
 ECR_SCAN_HELPER_BUCKET = f"""ecr-scan-helper-{boto3.client("sts", region_name=DEFAULT_REGION).get_caller_identity().get("Account")}"""
@@ -1361,6 +1367,11 @@ def get_framework_and_version_from_tag(image_uri):
             "1.8.0": "1.8.0.2.2.2.0",
         },
     },
+    "2.3.0": {
+        "pytorch": {
+            "1.11.0": "1.11.0.2.3.0.0",
+        },
+    },
     "1.19.1": {
         "pytorch": {
             "1.7.1": "1.7.1.2.3.0.0",
diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py
index 29bd051070c5..a8df9b811a6b 100644
--- a/test/test_utils/ec2.py
+++ b/test/test_utils/ec2.py
@@ -49,7 +49,7 @@ def filter_not_heavy_instance_types(instance_type_list):
     return filtered_list
-def get_ec2_instance_type(default, processor, filter_function=lambda x: x, efa=False, arch_type=""):
+def get_ec2_instance_type(default, processor, filter_function=lambda x: x, efa=False, arch_type="", job_type=""):
     """
     Get EC2 instance type from associated EC2_[CPU|GPU]_INSTANCE_TYPE env variable, or set it to a default
     for contexts where the variable is not present (i.e. PR, Nightly, local testing)
@@ -64,6 +64,7 @@ def get_ec2_instance_type(default, processor, filter_function=lambda x: x, efa=F
         a list.
     """
     allowed_processors = ("cpu", "gpu", "neuron", "hpu")
+    job_type_str = f"_{job_type.upper()}" if job_type else ""
     if processor not in allowed_processors:
         raise RuntimeError(
             f"Aborting EC2 test run. Unrecognized processor type {processor}. "
@@ -71,9 +72,9 @@ def get_ec2_instance_type(default, processor, filter_function=lambda x: x, efa=F
         )
     if default in HEAVY_INSTANCE_LIST and not efa:
         raise RuntimeError(f"Default instance type should never be one of {HEAVY_INSTANCE_LIST}, but it is {default}")
-    instance_type = os.getenv(f"EC2_{processor.upper()}_INSTANCE_TYPE")
+    instance_type = os.getenv(f"EC2_{processor.upper()}{job_type_str}_INSTANCE_TYPE")
     if arch_type == "graviton":
-        instance_type = os.getenv(f"EC2_{processor.upper()}_{arch_type.upper()}_INSTANCE_TYPE")
+        instance_type = os.getenv(f"EC2_{processor.upper()}_{arch_type.upper()}{job_type_str}_INSTANCE_TYPE")
     if not instance_type and is_mainline_context():
         return []
@@ -636,12 +637,13 @@ def execute_ec2_training_test(
     ipc = '--ipc=host' if "hpu" in ecr_uri and "pytorch" in ecr_uri else ""
     hpu_env_vars = f'-e GIT_BRANCH={synapseai_version}' if "hpu" in ecr_uri else ""
     habana_container_test_repo = '-v ${HOME}/gaudi-test-suite:/gaudi-test-suite' if "hpu" in ecr_uri else ""
+    neuron_device = '--device=/dev/neuron0' if "neuron" in ecr_uri else ""
     bin_bash_cmd = "--entrypoint /bin/bash " if bin_bash_entrypoint else ""
     connection.run(
         f"{docker_cmd} run --name {container_name} "
         f"{container_runtime} {ompi_mca_btl} {cap_add} {hpu_env_vars} "
         f"{ipc} {network}-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} "
-        f"{habana_container_test_repo} {shm_setting} -itd {bin_bash_cmd}{ecr_uri}",
+        f"{habana_container_test_repo} {shm_setting} {neuron_device} -itd {bin_bash_cmd}{ecr_uri}",
         hide=True,
     )
@@ -675,6 +677,12 @@ def execute_ec2_training_test(
             LOGGER.info(f"Could not upload the logs")
             return run_output
+    # Workaround: the AMI does not yet ship the latest Neuron driver, so reload the module first; the driver logs:
+    # [ 214.939271] Neuron Driver Started with Version:2.x.381.0-b70a76a18efb5e89ffed987461e9a1009d8b6f1e
+    # [ 214.939619] neuron-driver 0000:00:1e.0: BAR 4: can't reserve [mem 0x1000000000-0x17ffffffff 64bit pref]
+    if "neuron" in ecr_uri:
+        connection.run("sudo modprobe -r neuron && sudo modprobe -i neuron")
+
     return connection.run(
         f"{docker_cmd} exec --user root {container_name} {executable} -c '{test_cmd}'",
         hide=True,
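With the new job_type argument, the instance-type override is read from an environment variable whose name now embeds the job type, e.g. EC2_NEURON_TRAINING_INSTANCE_TYPE for processor="neuron", job_type="training". A standalone sketch mirroring the name construction above (the helper name is hypothetical):

    def instance_type_env_var(processor, job_type="", arch_type=""):
        # Mirrors get_ec2_instance_type: EC2_<PROCESSOR>[_<ARCH>][_<JOB_TYPE>]_INSTANCE_TYPE
        job_type_str = f"_{job_type.upper()}" if job_type else ""
        if arch_type == "graviton":
            return f"EC2_{processor.upper()}_{arch_type.upper()}{job_type_str}_INSTANCE_TYPE"
        return f"EC2_{processor.upper()}{job_type_str}_INSTANCE_TYPE"

    assert instance_type_env_var("neuron", job_type="training") == "EC2_NEURON_TRAINING_INSTANCE_TYPE"
    assert instance_type_env_var("gpu") == "EC2_GPU_INSTANCE_TYPE"
    assert instance_type_env_var("cpu", arch_type="graviton") == "EC2_CPU_GRAVITON_INSTANCE_TYPE"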
diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py
index 2cc7c9d94ee3..ec2f7fa74314 100644
--- a/test/test_utils/sagemaker.py
+++ b/test/test_utils/sagemaker.py
@@ -22,6 +22,7 @@
     get_python_invoker,
     is_pr_context,
     SAGEMAKER_EXECUTION_REGIONS,
+    SAGEMAKER_NEURON_EXECUTION_REGIONS,
     UBUNTU_18_BASE_DLAMI_US_EAST_1,
     UBUNTU_18_BASE_DLAMI_US_WEST_2,
     UL20_CPU_ARM64_US_EAST_1,
@@ -45,8 +46,10 @@ class DLCSageMakerLocalTestFailure(Exception):
 def assign_sagemaker_remote_job_instance_type(image):
     if "graviton" in image:
-        return "ml.c6g.2xlarge"
-    elif "neuron" in image:
+        return "c6g.2xlarge"
+    elif "training-neuron" in image:
+        return "ml.trn1.2xlarge"
+    elif "inference-neuron" in image:
         return "ml.inf1.xlarge"
     elif "gpu" in image:
         return "ml.p3.8xlarge"
@@ -199,7 +202,11 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type):
     efa_dedicated = os.getenv("EFA_DEDICATED", "False").lower() == "true"
     efa_flag = '--efa' if efa_dedicated else '-m \"not efa\"'
-    region_list = ",".join(SAGEMAKER_EXECUTION_REGIONS)
+    region_list = (
+        ",".join(SAGEMAKER_NEURON_EXECUTION_REGIONS)
+        if "neuron" in image
+        else ",".join(SAGEMAKER_EXECUTION_REGIONS)
+    )
     sagemaker_regions_list = f"--sagemaker-regions {region_list}"
diff --git a/test/test_utils/security.py b/test/test_utils/security.py
index 997f1f118715..eaad99f3a943 100644
--- a/test/test_utils/security.py
+++ b/test/test_utils/security.py
@@ -539,7 +539,7 @@ def get_ecr_vulnerability_package_version(vulnerability):
 def get_ecr_scan_allowlist_path(image_uri):
     dockerfile_location = test_utils.get_dockerfile_path_for_image(image_uri)
     image_scan_allowlist_path = dockerfile_location + ".os_scan_allowlist.json"
-    if test_utils.is_covered_by_ec2_sm_split(image_uri) and test_utils.is_ec2_sm_in_same_dockerfile(image_uri):
+    if not any(image_type in image_uri for image_type in ['neuron', 'eia']) and test_utils.is_covered_by_ec2_sm_split(image_uri) and test_utils.is_ec2_sm_in_same_dockerfile(image_uri):
         if test_utils.is_ec2_image(image_uri):
             image_scan_allowlist_path = image_scan_allowlist_path.replace("Dockerfile", "Dockerfile.ec2")
         else: