#### Prerequisites 

In [1]:
%%capture 

!pip install sagemaker==2.100.0
!pip install jedi==0.17  # this is a requirement for pygmentize to work

#### Imports  

In [2]:
from sagemaker.huggingface import HuggingFace
from sagemaker import get_execution_role
from sagemaker import Session
import sagemaker
import logging

##### Setup logging

In [3]:
# Send the SageMaker SDK's log records (DEBUG and above) to the notebook output.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)

# Attach the stream handler only once. Loggers are process-wide singletons, so
# re-running this cell would otherwise stack another handler on the same logger
# and every message would be printed once per re-run.
_NOTEBOOK_HANDLER_NAME = 'notebook-stream'
if not any(h.name == _NOTEBOOK_HANDLER_NAME for h in logger.handlers):
    _notebook_handler = logging.StreamHandler()
    _notebook_handler.name = _NOTEBOOK_HANDLER_NAME
    logger.addHandler(_notebook_handler)

##### Log versions of dependencies

In [4]:
# Record the version of the imported SageMaker SDK in the notebook output.
logger.info(f'[Using SageMaker: {sagemaker.__version__}]')

[Using SageMaker: 2.100.0]


#### Essentials 

In [5]:
# --- AWS context ---------------------------------------------------------------
session = Session()
ROLE = get_execution_role()            # execution role handed to the estimator
S3_BUCKET = session.default_bucket()   # the session's default bucket

# --- Training code -------------------------------------------------------------
SOURCE_DIR = './src'
ENTRY_POINT = 'pretrain.py'
BASE_JOB_NAME = 'hf-sm-pretrain-scratch'

# --- Compute -------------------------------------------------------------------
INSTANCE_TYPE, INSTANCE_COUNT = 'ml.p4d.24xlarge', 4
EBS_VOLUME_SIZE = 1024  # passed to the estimator as volume_size

# --- Training container versions -----------------------------------------------
TRANSFORMERS_VERSION = '4.17.0'
PYTORCH_VERSION = '1.10.2'
PYTHON_VERSION = 'py38'

In [6]:
# Show which default S3 bucket the session resolved to.
logger.info(f'S3 bucket = {S3_BUCKET}')

S3 bucket = sagemaker-us-east-1-119174016168


#### View training script

In [7]:
!pygmentize ./src/pretrain.py

from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments
from transformers import BertTokenizerFast
from transformers import BertForMaskedLM
from sagemaker.s3 import S3Downloader
from sagemaker.session import Session
from transformers import BertConfig
from sagemaker.s3 import S3Uploader
from transformers import pipeline
from

#### Create the estimator 

> Documentation for the SageMaker HuggingFace Estimator can be found [here](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/sagemaker.huggingface.html).

In [8]:
# Input channels for the training job: a single 'train' channel that points at
# the data/ prefix of the default bucket.
DATA = {
    'train': f's3://{S3_BUCKET}/data',
}

In [9]:
MAX_LENGTH = 512   # Context size for BERT tokenizer
CHUNK_SIZE = 128   # forwarded to the training script as 'chunk_size'
TRAIN_EPOCHS = 50  # forwarded as 'num_train_epochs'
BATCH_SIZE = 32    # forwarded as 'per_device_train_batch_size'
# Read the region from the SageMaker session instead of hard-coding it, so it
# always agrees with the session that owns S3_BUCKET and launches the training job.
REGION = session.boto_region_name

In [10]:
# Values handed to the estimator through its `hyperparameters` argument.
HYPERPARAMETERS = dict(
    s3_bucket=S3_BUCKET,
    max_len=MAX_LENGTH,
    chunk_size=CHUNK_SIZE,
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    region=REGION,
)

In [11]:
# Distribution settings for the estimator: enable the smdistributed
# data-parallel option.
DISTRIBUTION_STRATEGY = {
    'smdistributed': {
        'dataparallel': {'enabled': True},
    },
}

In [12]:
# Assemble the HuggingFace estimator from the constants defined in the cells above.
huggingface_estimator = HuggingFace(
    # training code
    entry_point=ENTRY_POINT,
    source_dir=SOURCE_DIR,
    base_job_name=BASE_JOB_NAME,
    # permissions and compute
    role=ROLE,
    instance_type=INSTANCE_TYPE,
    instance_count=INSTANCE_COUNT,
    volume_size=EBS_VOLUME_SIZE,
    # training configuration
    hyperparameters=HYPERPARAMETERS,
    distribution=DISTRIBUTION_STRATEGY,
    # container versions
    transformers_version=TRANSFORMERS_VERSION,
    pytorch_version=PYTORCH_VERSION,
    py_version=PYTHON_VERSION,
    # profiler and debugger hook are switched off for this job
    disable_profiler=True,
    debugger_hook_config=False,
)

In [13]:
# Submit the training job and return immediately (wait=False) instead of
# blocking the notebook until the job finishes.
huggingface_estimator.fit(inputs=DATA, wait=False)

Creating training-job with name: hf-sm-pretrain-scratch-2022-09-23-04-00-22-965
train request: {
    "AlgorithmSpecification": {
        "TrainingInputMode": "File",
        "TrainingImage": "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04",
        "EnableSageMakerMetricsTimeSeries": true
    },
    "OutputDataConfig": {
        "S3OutputPath": "s3://sagemaker-us-east-1-119174016168/"
    },
    "TrainingJobName": "hf-sm-pretrain-scratch-2022-09-23-04-00-22-965",
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 86400
    },
    "ResourceConfig": {
        "VolumeSizeInGB": 1024,
        "InstanceCount": 4,
        "InstanceType": "ml.p4d.24xlarge"
    },
    "RoleArn": "arn:aws:iam::119174016168:role/service-role/AmazonSageMaker-ExecutionRole-20211014T093628",
    "InputDataConfig": [
        {
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix