#### Prerequisites 

In [2]:
%%capture
# Install pinned dependencies into the running kernel's environment.
# %%capture (which must stay on the first line of the cell) hides pip's output.
%pip install sagemaker==2.100.0
# jedi is pinned because pygmentize needs this version to work.
%pip install jedi==0.17

#### Imports  

In [3]:
from sagemaker.huggingface import HuggingFace
from sagemaker import get_execution_role
from sagemaker import Session
import sagemaker
import logging

##### Setup logging

In [4]:
# Route the SageMaker SDK's log records to the notebook output.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)  # DEBUG is verbose: request payloads are printed too
# getLogger() returns the same logger object on every call, so re-running this
# cell would stack another handler and print every record twice. Attach a plain
# StreamHandler only when one is not already present.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies

In [5]:
# Record the SDK version this notebook actually ran with.
version_banner = f'[Using SageMaker: {sagemaker.__version__}]'
logger.info(version_banner)

[Using SageMaker: 2.100.0]


#### Essentials 

In [6]:
# --- AWS context ---
session = Session()
ROLE = get_execution_role()
S3_BUCKET = session.default_bucket()
logger.info(f'S3 bucket = {S3_BUCKET}')

# --- Training code location ---
SOURCE_DIR = './src'
ENTRY_POINT = 'train.py'

# --- Training cluster ---
INSTANCE_COUNT = 2
INSTANCE_TYPE = 'ml.p4d.24xlarge'
EBS_VOLUME_SIZE = 1024  # passed to the estimator as volume_size

# --- Framework versions requested from the estimator ---
PYTHON_VERSION = 'py38'
PYTORCH_VERSION = '1.10.2'
TRANSFORMERS_VERSION = '4.17.0'

S3 bucket = sagemaker-us-east-1-119174016168


#### View training script

In [7]:
# Disabled by default. Uncomment the line below to print the training script
# inline with syntax highlighting (pygmentize relies on the jedi pin installed
# in the Prerequisites cell).
#!pygmentize ./src/train.py

#### Create the estimator 

* See the [SageMaker HuggingFace Estimator documentation](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/sagemaker.huggingface.html) for the full list of estimator parameters.

In [8]:
# Input channels for the training job: channel name -> S3 prefix.
train_s3_uri = f's3://{S3_BUCKET}/data'
DATA = {'train': train_s3_uri}

In [9]:
# Tunable values that are forwarded to the training script via HYPERPARAMETERS.
TRAIN_EPOCHS = 1
BATCH_SIZE = 32  # sent as per_device_train_batch_size
CHUNK_SIZE = 128
MAX_LENGTH = 512  # Context size for BERT tokenizer

In [10]:
# Hyperparameters handed to the training entry point, one entry per line.
HYPERPARAMETERS = {
    's3_bucket': S3_BUCKET,
    'max_len': MAX_LENGTH,
    'chunk_size': CHUNK_SIZE,
    'num_train_epochs': TRAIN_EPOCHS,
    'per_device_train_batch_size': BATCH_SIZE,
}

In [11]:
# Distribution settings passed to the estimator: turn on the
# smdistributed "dataparallel" option.
DISTRIBUTION_STRATEGY = {
    'smdistributed': {
        'dataparallel': {'enabled': True},
    },
}

In [12]:
# Gather the estimator arguments by concern, then build the estimator in one call.
estimator_kwargs = {
    # training code
    'entry_point': ENTRY_POINT,
    'source_dir': SOURCE_DIR,
    'hyperparameters': HYPERPARAMETERS,
    # permissions and cluster
    'role': ROLE,
    'instance_type': INSTANCE_TYPE,
    'instance_count': INSTANCE_COUNT,
    'volume_size': EBS_VOLUME_SIZE,
    'distribution': DISTRIBUTION_STRATEGY,
    # framework versions
    'transformers_version': TRANSFORMERS_VERSION,
    'pytorch_version': PYTORCH_VERSION,
    'py_version': PYTHON_VERSION,
    # profiler and debugger hook are switched off for this job
    'disable_profiler': True,
    'debugger_hook_config': False,
}
huggingface_estimator = HuggingFace(**estimator_kwargs)

In [13]:
# Submit the training job using the DATA channels; wait=False is passed so the
# call is not asked to wait for the job.
huggingface_estimator.fit(inputs=DATA, wait=False)

Creating training-job with name: huggingface-pytorch-training-2022-08-22-15-43-52-093
train request: {
    "AlgorithmSpecification": {
        "TrainingInputMode": "File",
        "TrainingImage": "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04",
        "EnableSageMakerMetricsTimeSeries": true
    },
    "OutputDataConfig": {
        "S3OutputPath": "s3://sagemaker-us-east-1-119174016168/"
    },
    "TrainingJobName": "huggingface-pytorch-training-2022-08-22-15-43-52-093",
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 86400
    },
    "ResourceConfig": {
        "VolumeSizeInGB": 1024,
        "InstanceCount": 2,
        "InstanceType": "ml.p4d.24xlarge"
    },
    "RoleArn": "arn:aws:iam::119174016168:role/service-role/AmazonSageMaker-ExecutionRole-20211014T093628",
    "InputDataConfig": [
        {
            "DataSource": {
                "S3DataSource": {
                    "S3DataType