#### Prerequisites

In [2]:
%%capture 

!pip install sagemaker==2.100.0

#### Imports 

In [3]:
from sagemaker.huggingface import HuggingFaceProcessor
from sagemaker.processing import ProcessingOutput
from sagemaker.processing import ProcessingInput
from sagemaker import get_execution_role
from sagemaker import Session
import sagemaker
import logging

##### Setup logging

In [4]:
# Surface records from the 'sagemaker' logger in the notebook at DEBUG level.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# Attach the stream handler only once: re-running this cell would otherwise
# add another handler each time and duplicate every log line.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies

In [5]:
# Record the SDK version in the notebook output so runs can be reproduced.
logger.info('[Using SageMaker version: %s]', sagemaker.__version__)

[Using SageMaker version: 2.100.0]


#### Essentials 

In [6]:
# SageMaker session, its default S3 bucket, and the execution role for the job.
session = Session()
S3_BUCKET = session.default_bucket()
ROLE = get_execution_role()

# S3 keys, relative to S3_BUCKET, for the job's inputs and output.
S3_INPUT = 'data/covid_articles.txt'
S3_VOCAB = 'data/vocab'
S3_OUTPUT = 'data/processed'

# Compute resources for the processing job.
INSTANCE_TYPE = 'ml.g4dn.xlarge'
INSTANCE_COUNT = 1

# Framework versions passed to the HuggingFace processor.
PYTORCH_VERSION = '1.6.0'
# Backward-compatible alias: this value has always been the PyTorch version
# (it is passed as `pytorch_version`), not a Python interpreter version.
PYTHON_VERSION = PYTORCH_VERSION
TRANSFORMERS_VERSION = '4.4.2'

# Job naming and the location of the processing script.
BASE_JOB_NAME = 'hf-sm-clm-custom-tokenize'
SOURCE_DIR = './src'
CODE = 'preprocess_clm_custom.py'

In [7]:
# Show which bucket the job will read from and write to.
logger.info('S3 bucket = %s', S3_BUCKET)

S3 bucket = sagemaker-us-east-1-119174016168


#### View processing script

In [8]:
!pygmentize -v ./src/preprocess_clm_custom.py

[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m GPT2TokenizerFast
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m GPT2Config
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m load_dataset
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m DatasetDict
[34mfrom[39;49;00m [04m[36mpathlib[39;49;00m [34mimport[39;49;00m Path
[34mimport[39;49;00m [04m[36mtransformers[39;49;00m 
[34mimport[39;49;00m [04m[36mdatasets[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m


[37m# Setup logging[39;49;00m
logger = logging.getLogger([31m__name__[39;49;00m)
logging.basicConfig(level=logging.getLevelName([33m'[39;49;00m[33mINFO[39;49;00m[33m'[39;49;00m), 
                    handlers=[logging.StreamHandler(sys.stdout)], 
                    [36mformat[39;49;00m=[33

#### Create HuggingFace Processor

In [9]:
# Build the HuggingFace processor from the configuration constants.
# NOTE: PYTHON_VERSION holds the PyTorch version despite its name.
processor = HuggingFaceProcessor(
    role=ROLE,
    instance_type=INSTANCE_TYPE,
    instance_count=INSTANCE_COUNT,
    transformers_version=TRANSFORMERS_VERSION,
    pytorch_version=PYTHON_VERSION,
    base_job_name=BASE_JOB_NAME,
)

#### Run SageMaker Processing Job

In [10]:
%%time

processor.run(code=CODE, 
              source_dir=SOURCE_DIR,
              inputs=[ProcessingInput(input_name='covid-articles', 
                                      source=f's3://{S3_BUCKET}/{S3_INPUT}', 
                                      destination='/opt/ml/processing/input/data'),
                      ProcessingInput(input_name='custom-vocab',
                                      source=f's3://{S3_BUCKET}/{S3_VOCAB}',
                                      destination='/opt/ml/processing/input/vocab')],
              outputs=[ProcessingOutput(output_name='tokenized-datasets', 
                                        source='/opt/ml/processing/output', 
                                        destination=f's3://{S3_BUCKET}/{S3_OUTPUT}')], 
              wait=False)

Uploaded ./src to s3://sagemaker-us-east-1-119174016168/hf-sm-clm-custom-tokenize-2023-01-25-04-56-01-751/source/sourcedir.tar.gz
runproc.sh uploaded to s3://sagemaker-us-east-1-119174016168/hf-sm-clm-custom-tokenize-2023-01-25-04-56-01-751/source/runproc.sh
Creating processing-job with name hf-sm-clm-custom-tokenize-2023-01-25-04-56-01-751
process request: {
    "ProcessingJobName": "hf-sm-clm-custom-tokenize-2023-01-25-04-56-01-751",
    "ProcessingResources": {
        "ClusterConfig": {
            "InstanceType": "ml.g4dn.xlarge",
            "InstanceCount": 1,
            "VolumeSizeInGB": 30
        }
    },
    "AppSpecification": {
        "ImageUri": "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04",
        "ContainerEntrypoint": [
            "/bin/bash",
            "/opt/ml/processing/input/entrypoint/runproc.sh"
        ]
    },
    "RoleArn": "arn:aws:iam::119174016168:role/service-role/Amazon


Job Name:  hf-sm-clm-custom-tokenize-2023-01-25-04-56-01-751
Inputs:  [{'InputName': 'covid-articles', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-119174016168/data/covid_articles.txt', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'custom-vocab', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-119174016168/data/vocab', 'LocalPath': '/opt/ml/processing/input/vocab', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-119174016168/hf-sm-clm-custom-tokenize-2023-01-25-04-56-01-751/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S