#### Copied and modified from
https://github.com/aws-samples/sagemaker-distributed-training-workshop/blob/main/1_data_parallel/PyTorch%20Lightning%20on%20SageMaker.ipynb

Another important reference, on training a T5 model with SageMaker:

https://www.thetechplatform.com/post/conversational-summarization-with-natural-language-processing

### Run this notebook from outside the `arxiv_hunter` folder (i.e. from its parent directory)

In [29]:
# Print the current working directory to confirm where the notebook is running.
!pwd

/home/ec2-user/SageMaker


In [19]:
## Copy the data from S3 to the local disk
#!mkdir data

In [28]:
!aws s3 cp s3://sagemaker-traning-checkpoint/tar-file/processed.tar.gz arxiv_hunter/data

download: s3://sagemaker-traning-checkpoint/tar-file/processed.tar.gz to arxiv_hunter/data/processed.tar.gz


In [2]:
!pip install --upgrade pip

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting pip
  Downloading pip-22.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
Successfully installed pip-22.3.1


In [3]:
!pip install sagemaker
!pip install boto3 --upgrade

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting botocore<1.30.0,>=1.29.20
  Downloading botocore-1.29.33-py3-none-any.whl (10.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0mm
Installing collected packages: botocore
  Attempting uninstall: botocore
    Found existing installation: botocore 1.24.19
    Uninstalling botocore-1.24.19:
      Successfully uninstalled botocore-1.24.19
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.27.20 requires botocore==1.29.20, but you have botocore 1.29.33 which is incompatible.
aiobotocore 2.0.1 requires botocore<1.22.9,>=1.22.8, but you have botocore 1.29.33 which is incompatible.[0m[31m
[0mSuccessfully installed botocore-1.29.33
Looking in indexes: https://pypi.org/simp

In [30]:
import sagemaker
# make sure this is at least 2.102.0
# NOTE(review): presumably the minimum SDK version for the `pytorchddp` distribution
# option passed to the estimator — confirm against the SDK release notes.
# Leaving the version as the last expression displays it as the cell output.
sagemaker.__version__

'2.118.0'

In [31]:
from sagemaker.debugger import FrameworkProfile, ProfilerConfig

# Profiling settings that are handed to the estimator:
# a 500 ms system-monitor interval and framework profiling limited to 10 steps.
profiler_config = ProfilerConfig(
    framework_profile_params=FrameworkProfile(num_steps=10),
    system_monitor_interval_millis=500,
)

In [6]:
import os


def step_out_of_project(project_dir="arxiv_hunter"):
    """Move to the parent directory only when currently inside ``project_dir``.

    A bare ``os.chdir("..")`` climbs one more level every time the cell is
    re-run, which breaks the relative ``source_dir`` used by the estimator.
    Guarding on the directory name makes the cell safe to run repeatedly and
    safe to run when the kernel already sits outside the project folder.

    Args:
        project_dir: Name of the project folder that should be left.
    """
    if os.path.basename(os.getcwd()) == project_dir:
        os.chdir("..")


# Show where the kernel was before the (possible) move, as the original cell did.
print(os.getcwd())
step_out_of_project()

/home/ec2-user/SageMaker/arxiv_hunter


In [32]:
# `os` is imported here because os.getenv is used below; without it this cell only
# works if the chdir cell (which also imports os) happened to run first.
import os

import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.local import LocalSession
from sagemaker.debugger import TensorBoardOutputConfig

sagemaker_session = sagemaker.Session()
# Prefer a role supplied through the SAGEMAKER_ROLE environment variable and fall
# back to the execution role attached to this notebook.
role = os.getenv('SAGEMAKER_ROLE') or sagemaker.get_execution_role()
region = sagemaker_session.boto_region_name

# hard code point to the DLC images
image_uri = f'763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu113-ubuntu20.04-sagemaker'

# Spot-training settings: max_wait is only set when spot instances are requested.
use_spot_instances = True
max_run = 43000
max_wait = 58000 if use_spot_instances else None
# NOTE(review): this checkpoint location is fixed, so successive jobs share it —
# confirm that picking up checkpoints from a previous job is intended.
checkpoint_s3_path = "s3://sagemaker-traning-checkpoint/checkpoints"
logs_s3_path = "s3://sagemaker-tb-logs"

# TensorBoard output: container-local path and the S3 prefix configured for it.
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path=f'{logs_s3_path}/tb_logs',
    container_local_output_path='/opt/tb_logs',
)

# `profiler_config` comes from the profiler cell earlier in the notebook.
estimator = PyTorch(
    entry_point="train_sagemaker.py",
    base_job_name="lightning-ddp-arxiv-hunter",
    image_uri=image_uri,
    role=role,
    # Relative path: the kernel must be running from the parent of arxiv_hunter.
    source_dir="arxiv_hunter",
    # configures the SageMaker training resource, you can increase as you need
    instance_count=1,
    instance_type="ml.g4dn.12xlarge",
    py_version="py38",
    sagemaker_session=sagemaker_session,
    distribution={"pytorchddp": {"enabled": True}},
    debugger_hook_config=False,
    profiler_config=profiler_config,
    checkpoint_s3_uri=checkpoint_s3_path,
    use_spot_instances=use_spot_instances,
    max_wait=max_wait,
    max_run=max_run,
    tensorboard_output_config=tensorboard_output_config,
)

In [33]:
# Start the training job. wait=False is passed so the call should return without
# blocking until the job finishes (NOTE(review): per the SageMaker SDK docs — confirm).
estimator.fit(wait=False)
# Blocking alternative, kept for reference:
#estimator.fit()

In [None]:
# check profiler report
import sagemaker
from sagemaker.pytorch import PyTorch

rule_output_path = estimator.output_path + estimator.latest_training_job.job_name + "/rule-output"
! aws s3 cp {rule_output_path} ./ --recursive