#### Prerequisites

In [1]:
%%capture 

!pip install sagemaker==2.100.0
!pip install jedi==0.17

#### Imports 

In [2]:
from sagemaker.huggingface import HuggingFaceProcessor
from sagemaker.processing import ProcessingOutput
from sagemaker.processing import ProcessingInput
from sagemaker import get_execution_role
from sagemaker import Session
import sagemaker
import logging

##### Setup logging

In [3]:
# Send the 'sagemaker' logger's records to the notebook output at DEBUG level.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)

# Attach the stream handler only once. Loggers are process-wide singletons, so
# re-running this cell would otherwise stack another handler on the same logger
# and every record would be printed once per re-run.
_HANDLER_NAME = 'notebook-stream'
if not any(handler.get_name() == _HANDLER_NAME for handler in logger.handlers):
    _stream_handler = logging.StreamHandler()
    _stream_handler.set_name(_HANDLER_NAME)
    logger.addHandler(_stream_handler)

##### Log the SageMaker SDK version

In [4]:
# Record the SageMaker SDK version in use for this run.
sdk_version_message = f'[Using SageMaker version: {sagemaker.__version__}]'
logger.info(sdk_version_message)

[Using SageMaker version: 2.100.0]


#### Essentials: session, role, S3 locations, and job configuration

In [5]:
# The session supplies the default bucket that every S3 URI below is built on.
session = Session()
S3_BUCKET = session.default_bucket()
ROLE = get_execution_role()

# S3 keys / prefixes, relative to S3_BUCKET.
S3_INPUT = 'data/covid_articles_clf_data.csv'
S3_VOCAB = 'data/vocab/vocab.txt'
S3_OUTPUT = 'data/processed-clf'

# Processing job resources.
INSTANCE_TYPE = 'ml.g4dn.xlarge'
INSTANCE_COUNT = 1

# Framework versions handed to the HuggingFace processor.
PYTORCH_VERSION = '1.6.0'
# Backward-compatible alias: this value is passed as `pytorch_version`, so the
# original name was misleading. Kept so cells that read PYTHON_VERSION still work.
PYTHON_VERSION = PYTORCH_VERSION
TRANSFORMERS_VERSION = '4.4.2'

# Job naming, and the local source directory / entry script the job runs.
BASE_JOB_NAME = 'hf-sm-clf-custom-tokenize'
SOURCE_DIR = './src'
CODE = 'preprocess_clf_custom.py'

#### View processing script 

In [6]:
!pygmentize -v ./src/preprocess_clf_custom.py

[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m TrainingArguments
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m BertTokenizerFast
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m BertConfig
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m pipeline
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m Trainer
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m load_dataset
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m DatasetDict
[34mimport[39;49;00m [04m[36mtransformers[39;49;00m 
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m
[34mimport[39;49;00m [04m[36mdatasets[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m
[34mimport[39;49;00m 

#### Create HuggingFace Processor

In [7]:
# Build the processor on the same Session whose default bucket the S3 URIs use,
# instead of letting the processor create a separate default session.
# NOTE: despite its name, PYTHON_VERSION is passed as the PyTorch framework version.
processor = HuggingFaceProcessor(role=ROLE,
                                 instance_type=INSTANCE_TYPE,
                                 instance_count=INSTANCE_COUNT,
                                 transformers_version=TRANSFORMERS_VERSION,
                                 pytorch_version=PYTHON_VERSION,
                                 base_job_name=BASE_JOB_NAME,
                                 sagemaker_session=session)
logger.info(f'HuggingFace Processor: {processor}')

HuggingFace Processor: <sagemaker.huggingface.processing.HuggingFaceProcessor object at 0x7f3b6c058550>


#### Run SageMaker Processing job

In [8]:
%%time

processor.run(code=CODE, 
              source_dir=SOURCE_DIR,
              inputs=[ProcessingInput(input_name='article-headlines', 
                                      source=f's3://{S3_BUCKET}/{S3_INPUT}', 
                                      destination='/opt/ml/processing/input/data'),
                      ProcessingInput(input_name='bert-vocab',
                                      source=f's3://{S3_BUCKET}/{S3_VOCAB}',
                                      destination='/opt/ml/processing/input/vocab')],
              outputs=[ProcessingOutput(output_name='processed-data-custom-clf', 
                                        source='/opt/ml/processing/output', 
                                        destination=f's3://{S3_BUCKET}/{S3_OUTPUT}')],
              wait=False)

Uploaded ./src to s3://sagemaker-us-east-1-119174016168/hf-sm-clf-custom-tokenize-2022-09-22-21-03-41-412/source/sourcedir.tar.gz
runproc.sh uploaded to s3://sagemaker-us-east-1-119174016168/hf-sm-clf-custom-tokenize-2022-09-22-21-03-41-412/source/runproc.sh
Creating processing-job with name hf-sm-clf-custom-tokenize-2022-09-22-21-03-41-412
process request: {
    "ProcessingJobName": "hf-sm-clf-custom-tokenize-2022-09-22-21-03-41-412",
    "ProcessingResources": {
        "ClusterConfig": {
            "InstanceType": "ml.g4dn.xlarge",
            "InstanceCount": 1,
            "VolumeSizeInGB": 30
        }
    },
    "AppSpecification": {
        "ImageUri": "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04",
        "ContainerEntrypoint": [
            "/bin/bash",
            "/opt/ml/processing/input/entrypoint/runproc.sh"
        ]
    },
    "RoleArn": "arn:aws:iam::119174016168:role/service-role/Amazon


Job Name:  hf-sm-clf-custom-tokenize-2022-09-22-21-03-41-412
Inputs:  [{'InputName': 'article-headlines', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-119174016168/data/covid_articles_clf_data.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'bert-vocab', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-119174016168/data/vocab/vocab.txt', 'LocalPath': '/opt/ml/processing/input/vocab', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-119174016168/hf-sm-clf-custom-tokenize-2022-09-22-21-03-41-412/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': '