#### Prerequisites

In [1]:
%%capture 

!pip install sagemaker==2.100.0
!pip install jedi==0.17

#### Imports 

In [2]:
from sagemaker.huggingface import HuggingFaceProcessor
from sagemaker.processing import ProcessingOutput
from sagemaker.processing import ProcessingInput
from sagemaker import get_execution_role
from sagemaker import Session
import sagemaker
import logging

##### Set up logging

In [3]:
# Surface the 'sagemaker' logger's records in the notebook output at DEBUG level.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)

# Attach the stream handler only once: loggers are process-wide singletons, so
# re-running this cell would otherwise stack another handler on the same
# logger and every message would be emitted multiple times.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [4]:
# Record the SDK version in the notebook output so a run can be traced back
# to the exact library version that produced it.
logger.info(
    f'[Using SageMaker version: {sagemaker.__version__}]'
)

[Using SageMaker version: 2.100.0]


#### Essentials

In [5]:
# SageMaker session, its default S3 bucket, and the execution role used by the job.
session = Session()
S3_BUCKET = session.default_bucket()
ROLE = get_execution_role()

# S3 keys relative to S3_BUCKET: the input CSV and the prefix that receives
# the processed output.
S3_INPUT = 'data/covid_articles_clf_data.csv'
S3_OUTPUT = 'data/bert/processed-clf'

# Compute resources for the processing job.
INSTANCE_TYPE = 'ml.g4dn.xlarge'
INSTANCE_COUNT = 1

# Framework versions passed to the processor.
PYTORCH_VERSION = '1.6.0'
# Backward-compatible alias: this value is handed to the processor as its
# `pytorch_version`, so it is a PyTorch version, not a Python version. The
# old name is kept so cells that still reference PYTHON_VERSION keep working.
PYTHON_VERSION = PYTORCH_VERSION
TRANSFORMERS_VERSION = '4.4.2'

# Job naming and the location of the processing script.
BASE_JOB_NAME = 'hf-sagemaker-processor'
SOURCE_DIR = './src'
CODE = 'preprocess_data_clf_oob.py'

#### View processing script 

In [6]:
!pygmentize -v ./src/preprocess_data_clf_oob.py

[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m TrainingArguments
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m BertTokenizerFast
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m pipeline
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m Trainer
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m load_dataset
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m DatasetDict
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m
[34mimport[39;49;00m [04m[36mpickle[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m


[37m# Setup logging[39;49;00m
logger = logging.getLogger([31m__name__[39;49;00m)
logging.basicConfig(level=logging.get

#### Create HuggingFace Processor

In [7]:
# Configure the HuggingFace processor from the constants defined above.
# Note: PYTHON_VERSION is passed as the *PyTorch* version of the container.
processor = HuggingFaceProcessor(
    role=ROLE,
    instance_type=INSTANCE_TYPE,
    instance_count=INSTANCE_COUNT,
    transformers_version=TRANSFORMERS_VERSION,
    pytorch_version=PYTHON_VERSION,
    base_job_name=BASE_JOB_NAME,
)

#### Run SageMaker Processing job

In [8]:
%%time

processor.run(code=CODE, 
              source_dir=SOURCE_DIR,
              inputs=[ProcessingInput(input_name='article-headlines', 
                                      source=f's3://{S3_BUCKET}/{S3_INPUT}', 
                                      destination='/opt/ml/processing/input/data')],
              outputs=[ProcessingOutput(output_name='processed-data-oob-clf', 
                                        source='/opt/ml/processing/output', 
                                        destination=f's3://{S3_BUCKET}/{S3_OUTPUT}')], 
              wait=False)

Uploaded ./src to s3://sagemaker-us-east-1-119174016168/hf-sagemaker-processor-2022-09-14-19-38-52-933/source/sourcedir.tar.gz
runproc.sh uploaded to s3://sagemaker-us-east-1-119174016168/hf-sagemaker-processor-2022-09-14-19-38-52-933/source/runproc.sh
Creating processing-job with name hf-sagemaker-processor-2022-09-14-19-38-52-933
process request: {
    "ProcessingJobName": "hf-sagemaker-processor-2022-09-14-19-38-52-933",
    "ProcessingResources": {
        "ClusterConfig": {
            "InstanceType": "ml.g4dn.xlarge",
            "InstanceCount": 1,
            "VolumeSizeInGB": 30
        }
    },
    "AppSpecification": {
        "ImageUri": "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04",
        "ContainerEntrypoint": [
            "/bin/bash",
            "/opt/ml/processing/input/entrypoint/runproc.sh"
        ]
    },
    "RoleArn": "arn:aws:iam::119174016168:role/service-role/AmazonSageMaker-Ex


Job Name:  hf-sagemaker-processor-2022-09-14-19-38-52-933
Inputs:  [{'InputName': 'article-headlines', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-119174016168/data/covid_articles_clf_data.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-119174016168/hf-sagemaker-processor-2022-09-14-19-38-52-933/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-119174016168/hf-sagemaker-processor-2022-09-14-19-38-52-933/source/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3Inpu